diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 8788dc2c059d6..8d491f6bafbbc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -1,61 +1,61 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i32_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v32f32: ; VI: ; %bb.0: @@ -225,56 +225,376 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v32i32_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v32i32: ; VI: ; %bb.0: @@ -286,7 +606,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -321,7 +641,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -336,7 +656,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -371,7 +691,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -388,7 +708,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 @@ -407,7 +727,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -428,56 +748,360 @@ end: ret <32 x i32> %phi } +define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v32f32_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i32_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v16i64: ; VI: ; %bb.0: @@ -489,7 +1113,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 @@ -524,7 +1148,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -539,7 +1163,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 @@ -574,7 +1198,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -591,7 +1215,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 @@ -626,7 +1250,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -647,56 +1271,376 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v32i32_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v32i32: ; VI: ; %bb.0: @@ -708,7 +1652,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -743,7 +1687,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -758,7 +1702,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -793,7 +1737,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -810,7 +1754,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -853,7 +1797,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -874,183 +1818,511 @@ end: ret <32 x i32> %phi } -define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v32i32_to_v16f64: +; VI-LABEL: bitcast_v16i64_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v32i32_to_v16f64: +; GFX9-LABEL: bitcast_v16i64_to_v32i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32i32_to_v16f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -1072,7 +2344,7 @@ define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1093,40 +2365,360 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v32i32_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v32i32: ; VI: ; %bb.0: @@ -1138,7 +2730,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -1157,7 +2749,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1172,7 +2764,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -1191,7 +2783,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1208,7 +2800,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -1227,7 +2819,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1248,1228 +2840,1478 @@ end: ret <32 x i32> %phi } +define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v16f64_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 -; GCN-NEXT: v_or_b32_e32 v1, v1, v52 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v50, v31 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 -; GCN-NEXT: v_or_b32_e32 v49, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v57 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v29, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v1, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v5, v1, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v36 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v44 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v45 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v56 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v58 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v59 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v60 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i32_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: @@ -2670,7 +4512,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -2846,9 +4688,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 ; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 @@ -3055,7 +4897,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -3647,7 +5489,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -3841,9 +5683,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 @@ -4069,7 +5911,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4517,7 +6359,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -4584,9 +6426,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 @@ -4686,7 +6528,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB6_4: ; %end +; GFX11-TRUE16-NEXT: .LBB12_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5135,7 +6977,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5234,9 +7076,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 @@ -5368,7 +7210,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -5710,1621 +7552,6408 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_or_b32_e32 v40, s4, v40 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v54 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v54, v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; SI-NEXT: v_or_b32_e32 v53, s4, v53 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s67, 0xff +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s66, 24 +; SI-NEXT: v_or_b32_e32 v54, v40, v54 +; SI-NEXT: v_and_b32_e32 v53, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v54, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v53, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v55, v54, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v52, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; SI-NEXT: v_or_b32_e32 v50, s4, v50 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s55, 24 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v48, v48, v49 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v48, v50, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v52, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; SI-NEXT: v_or_b32_e32 v39, s4, v39 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s53, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v37 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s52, 24 +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v48, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v39, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v49, v48, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_or_b32_e32 v35, s4, v35 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s49, 24 +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v35, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v29, s4, v29 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s38, 24 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v23, s4, v23 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s35, 24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s31, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s30, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s93, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s91, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s90, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s79, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s76, 24 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s73, 24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s62, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s56, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v32i32_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_lshl_b32 s61, s65, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: s_lshl_b32 s61, s48, 8 +; VI-NEXT: s_and_b32 s63, s64, 0xff +; VI-NEXT: s_or_b32 s61, s63, s61 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s61, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s55, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s59, 0xff +; VI-NEXT: s_lshl_b32 s58, s58, 8 +; VI-NEXT: s_or_b32 s17, s17, s58 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_lshl_b32 s16, s54, 8 +; VI-NEXT: s_and_b32 s17, s18, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s38, 8 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s57, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: s_lshl_b32 s16, s56, 8 +; VI-NEXT: s_and_b32 s17, s20, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s36, 8 +; VI-NEXT: s_and_b32 s18, s47, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xff +; VI-NEXT: s_lshl_b32 s17, s46, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s50, 0xff +; VI-NEXT: s_lshl_b32 s18, s87, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v6, s16 +; VI-NEXT: s_lshl_b32 s16, s86, 8 +; VI-NEXT: s_and_b32 s17, s22, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s34, 8 +; VI-NEXT: s_and_b32 s18, s85, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xff +; VI-NEXT: s_lshl_b32 s17, s84, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s83, 0xff +; VI-NEXT: s_lshl_b32 s18, s82, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: s_lshl_b32 s16, s81, 8 +; VI-NEXT: s_and_b32 s17, s24, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s30, 8 +; VI-NEXT: s_and_b32 s18, s80, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xff +; VI-NEXT: s_lshl_b32 s17, s71, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s70, 0xff +; VI-NEXT: s_lshl_b32 s18, s69, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: s_lshl_b32 s16, s68, 8 +; VI-NEXT: s_and_b32 s17, s26, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s90, 8 +; VI-NEXT: s_and_b32 s18, s67, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: s_and_b32 s16, s27, 0xff +; VI-NEXT: s_lshl_b32 s17, s66, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 59 +; VI-NEXT: v_readlane_b32 s18, v21, 58 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 57 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s28, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 56 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s88, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 55 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: s_and_b32 s16, s29, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 54 +; VI-NEXT: v_readlane_b32 s18, v21, 53 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 52 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s44, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 51 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s78, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 50 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: s_and_b32 s16, s45, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 49 +; VI-NEXT: v_readlane_b32 s18, v21, 48 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 47 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s42, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 46 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s76, 8 +; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 45 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s43, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 44 +; VI-NEXT: v_readlane_b32 s18, v21, 43 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 42 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s40, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 41 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s74, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 40 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 39 +; VI-NEXT: v_readlane_b32 s18, v21, 38 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 37 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 36 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: v_readlane_b32 s15, v21, 35 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: v_readlane_b32 s15, v21, 34 +; VI-NEXT: v_readlane_b32 s16, v21, 33 +; VI-NEXT: s_and_b32 s15, s15, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 32 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 31 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: v_readlane_b32 s13, v21, 30 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: v_readlane_b32 s13, v21, 29 +; VI-NEXT: v_readlane_b32 s14, v21, 28 +; VI-NEXT: s_and_b32 s13, s13, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 27 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 26 +; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s13, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: v_readlane_b32 s11, v21, 25 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: v_readlane_b32 s11, v21, 24 +; VI-NEXT: v_readlane_b32 s12, v21, 23 +; VI-NEXT: s_and_b32 s11, s11, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 22 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 21 +; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff +; VI-NEXT: v_readlane_b32 s9, v21, 20 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: v_readlane_b32 s9, v21, 19 +; VI-NEXT: v_readlane_b32 s10, v21, 18 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 17 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 16 +; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: v_readlane_b32 s7, v21, 15 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 14 +; VI-NEXT: v_readlane_b32 s8, v21, 13 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 12 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 11 +; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: v_readlane_b32 s5, v21, 10 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s5, v21, 9 +; VI-NEXT: v_readlane_b32 s6, v21, 8 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s15, v21, 1 +; VI-NEXT: v_readlane_b32 s13, v21, 3 +; VI-NEXT: v_readlane_b32 s11, v21, 5 +; VI-NEXT: v_readlane_b32 s9, v21, 7 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v20, s34, 2 +; GFX9-NEXT: v_writelane_b32 v20, s35, 3 +; GFX9-NEXT: v_writelane_b32 v20, s36, 4 +; GFX9-NEXT: v_writelane_b32 v20, s37, 5 +; GFX9-NEXT: v_writelane_b32 v20, s38, 6 +; GFX9-NEXT: v_writelane_b32 v20, s39, 7 +; GFX9-NEXT: v_writelane_b32 v20, s48, 8 +; GFX9-NEXT: v_writelane_b32 v20, s49, 9 +; GFX9-NEXT: v_writelane_b32 v20, s50, 10 +; GFX9-NEXT: v_writelane_b32 v20, s51, 11 +; GFX9-NEXT: v_writelane_b32 v20, s52, 12 +; GFX9-NEXT: v_writelane_b32 v20, s53, 13 +; GFX9-NEXT: v_writelane_b32 v20, s54, 14 +; GFX9-NEXT: v_writelane_b32 v20, s55, 15 +; GFX9-NEXT: v_writelane_b32 v20, s64, 16 +; GFX9-NEXT: v_writelane_b32 v20, s65, 17 +; GFX9-NEXT: v_writelane_b32 v20, s66, 18 +; GFX9-NEXT: v_writelane_b32 v20, s67, 19 +; GFX9-NEXT: v_writelane_b32 v20, s68, 20 +; GFX9-NEXT: v_writelane_b32 v20, s69, 21 +; GFX9-NEXT: v_writelane_b32 v20, s70, 22 +; GFX9-NEXT: v_writelane_b32 v20, s71, 23 +; GFX9-NEXT: v_writelane_b32 v20, s80, 24 +; GFX9-NEXT: v_writelane_b32 v20, s81, 25 +; GFX9-NEXT: v_writelane_b32 v20, s82, 26 +; GFX9-NEXT: v_writelane_b32 v20, s83, 27 +; GFX9-NEXT: v_writelane_b32 v20, s84, 28 +; GFX9-NEXT: v_writelane_b32 v20, s85, 29 +; GFX9-NEXT: v_writelane_b32 v20, s86, 30 +; GFX9-NEXT: v_writelane_b32 v20, s87, 31 +; GFX9-NEXT: v_writelane_b32 v20, s96, 32 +; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v20, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: v_readfirstlane_b32 s42, v3 +; GFX9-NEXT: v_readfirstlane_b32 s43, v4 +; GFX9-NEXT: v_readfirstlane_b32 s40, v5 +; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: v_writelane_b32 v20, s99, 35 +; GFX9-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s5, s5, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: s_add_i32 s4, s4, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: s_lshl_b32 s46, s36, 8 +; GFX9-NEXT: s_and_b32 s47, s80, 0xff +; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s46, s46, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s71, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s69, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: s_lshl_b32 s16, s68, 8 +; GFX9-NEXT: s_and_b32 s17, s18, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s34, 8 +; GFX9-NEXT: s_and_b32 s18, s67, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: s_and_b32 s16, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s66, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s64, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshl_b32 s16, s55, 8 +; GFX9-NEXT: s_and_b32 s17, s20, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s30, 8 +; GFX9-NEXT: s_and_b32 s18, s54, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: s_and_b32 s16, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s53, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s51, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: s_lshl_b32 s16, s50, 8 +; GFX9-NEXT: s_and_b32 s17, s22, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s94, 8 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: s_and_b32 s16, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: s_lshl_b32 s16, s99, 8 +; GFX9-NEXT: s_and_b32 s17, s24, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s92, 8 +; GFX9-NEXT: s_and_b32 s18, s98, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: s_and_b32 s16, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s97, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s87, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: s_lshl_b32 s16, s86, 8 +; GFX9-NEXT: s_and_b32 s17, s26, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s90, 8 +; GFX9-NEXT: s_and_b32 s18, s85, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: s_and_b32 s16, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s84, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s83, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: s_lshl_b32 s16, s82, 8 +; GFX9-NEXT: s_and_b32 s17, s28, 0xff +; GFX9-NEXT: v_readlane_b32 s18, v21, 50 +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s88, 8 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 49 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: s_and_b32 s16, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 48 +; GFX9-NEXT: v_readlane_b32 s18, v21, 47 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 45 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s78, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 43 +; GFX9-NEXT: v_readlane_b32 s18, v21, 42 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 40 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s76, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 38 +; GFX9-NEXT: v_readlane_b32 s18, v21, 37 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 35 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s74, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 33 +; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s72, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: s_lshl_b32 s15, s15, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: v_readlane_b32 s15, v21, 28 +; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: s_and_b32 s15, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s62, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: v_readlane_b32 s13, v21, 23 +; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: s_and_b32 s13, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s60, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: v_readlane_b32 s11, v21, 18 +; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: s_and_b32 s11, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s58, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: v_readlane_b32 s9, v21, 13 +; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: s_and_b32 s9, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_readlane_b32 s7, v21, 8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 5 +; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_readlane_b32 s5, v21, 3 +; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_readlane_b32 s9, v21, 1 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: v_readlane_b32 s99, v20, 35 +; GFX9-NEXT: v_readlane_b32 s98, v20, 34 +; GFX9-NEXT: v_readlane_b32 s97, v20, 33 +; GFX9-NEXT: v_readlane_b32 s96, v20, 32 +; GFX9-NEXT: v_readlane_b32 s87, v20, 31 +; GFX9-NEXT: v_readlane_b32 s86, v20, 30 +; GFX9-NEXT: v_readlane_b32 s85, v20, 29 +; GFX9-NEXT: v_readlane_b32 s84, v20, 28 +; GFX9-NEXT: v_readlane_b32 s83, v20, 27 +; GFX9-NEXT: v_readlane_b32 s82, v20, 26 +; GFX9-NEXT: v_readlane_b32 s81, v20, 25 +; GFX9-NEXT: v_readlane_b32 s80, v20, 24 +; GFX9-NEXT: v_readlane_b32 s71, v20, 23 +; GFX9-NEXT: v_readlane_b32 s70, v20, 22 +; GFX9-NEXT: v_readlane_b32 s69, v20, 21 +; GFX9-NEXT: v_readlane_b32 s68, v20, 20 +; GFX9-NEXT: v_readlane_b32 s67, v20, 19 +; GFX9-NEXT: v_readlane_b32 s66, v20, 18 +; GFX9-NEXT: v_readlane_b32 s65, v20, 17 +; GFX9-NEXT: v_readlane_b32 s64, v20, 16 +; GFX9-NEXT: v_readlane_b32 s55, v20, 15 +; GFX9-NEXT: v_readlane_b32 s54, v20, 14 +; GFX9-NEXT: v_readlane_b32 s53, v20, 13 +; GFX9-NEXT: v_readlane_b32 s52, v20, 12 +; GFX9-NEXT: v_readlane_b32 s51, v20, 11 +; GFX9-NEXT: v_readlane_b32 s50, v20, 10 +; GFX9-NEXT: v_readlane_b32 s49, v20, 9 +; GFX9-NEXT: v_readlane_b32 s48, v20, 8 +; GFX9-NEXT: v_readlane_b32 s39, v20, 7 +; GFX9-NEXT: v_readlane_b32 s38, v20, 6 +; GFX9-NEXT: v_readlane_b32 s37, v20, 5 +; GFX9-NEXT: v_readlane_b32 s36, v20, 4 +; GFX9-NEXT: v_readlane_b32 s35, v20, 3 +; GFX9-NEXT: v_readlane_b32 s34, v20, 2 +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: v_writelane_b32 v21, s82, 0 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: v_writelane_b32 v21, s83, 1 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i32_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s40, s40, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s99 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s57, s57, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s56, s57, s56 +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s94 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s98 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s46, s46, s47 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s97 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s96 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s56, s56, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s57, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s87 +; GFX11-TRUE16-NEXT: s_or_b32 s56, s56, s57 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s86 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s45, s46, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s85 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s83 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s82 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s18 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s71 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s81 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s70 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s80 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s69 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s68 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s66 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s67 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s64 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s65 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s55 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s76 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s53 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s54 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[3:6], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s52 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s51 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s50 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s48 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s37 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s35 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s104 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s103 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s102 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s16, v18, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s101 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s18, v18, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s19, v18, 4 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 6 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s100 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v18, 10 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v18, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v18, 13 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-TRUE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v19, 2 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 15 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 18 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v18, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v19, 4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v19, 5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v18, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 23 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v19, 6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 25 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v18, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v18, 28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v18, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s5, v18, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v19, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v19, 7 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_branch .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, -1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 5 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 7 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 15 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, s104 +; GFX11-FAKE16-NEXT: s_mov_b32 s104, s57 +; GFX11-FAKE16-NEXT: s_mov_b32 s57, s69 +; GFX11-FAKE16-NEXT: s_mov_b32 s69, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_5 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s41, s41, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_add_i32 s40, s40, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s25, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-FAKE16-NEXT: .LBB13_5: ; %end +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s74, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 9 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s30, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s44, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v18, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s100, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s99, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v18, 6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s92, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 29 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v19, 28 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 19 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s86, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 21 +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 22 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 20 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s97, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s69, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s18, s72, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s73, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s96, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s87, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s85, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s84, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s29, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s83, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s82, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s81, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s61, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s70, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s58, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s68, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s67, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s66, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s12, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s65, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s64, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s55, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s54, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s53, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s52, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s51, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s11, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s50, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s49, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s48, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s8, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s39, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s38, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 11 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s37, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s36, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s35, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s56, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s7, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s34, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_hi, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s46, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 15 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s103, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s102, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32i32: ; VI: ; %bb.0: @@ -7656,7 +14285,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -8129,9 +14758,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -8520,7 +15149,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -8892,7 +15521,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -9366,9 +15995,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -9763,7 +16392,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -10005,15 +16634,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB7_4 -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB7_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h @@ -10371,8 +17000,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-TRUE16-NEXT: .LBB7_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 @@ -10994,7 +17623,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 @@ -11349,9 +17978,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB7_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 @@ -11706,7 +18335,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB7_4: ; %end +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 @@ -11780,1589 +18409,7703 @@ end: ret <32 x i32> %phi } -define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB15_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB15_3 +; SI-NEXT: .LBB15_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB15_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB15_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB15_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v32i32_to_v64bf16: +; VI-LABEL: bitcast_v128i8_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB15_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v58, v2 +; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB15_3 +; VI-NEXT: .LBB15_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB15_3: ; %Flow +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB15_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_or_b32_e32 v30, v30, v31 +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: .LBB8_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 +; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: .LBB15_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v32i32_to_v64bf16: +; GFX9-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB15_3 +; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB15_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: .LBB8_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32i32_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: .LBB8_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <32 x i32> %a, splat (i32 3) - %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> - br label %end - -cmp.false: - %a3 = bitcast <32 x i32> %a to <64 x bfloat> - br label %end - -end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi -} - -define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v64bf16_to_v32i32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB15_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s70, v1 +; SI-NEXT: v_readfirstlane_b32 s71, v2 +; SI-NEXT: v_readfirstlane_b32 s80, v3 +; SI-NEXT: v_readfirstlane_b32 s81, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v5 +; SI-NEXT: v_readfirstlane_b32 s83, v6 +; SI-NEXT: v_readfirstlane_b32 s84, v7 +; SI-NEXT: v_readfirstlane_b32 s85, v8 +; SI-NEXT: v_readfirstlane_b32 s86, v9 +; SI-NEXT: v_readfirstlane_b32 s87, v10 +; SI-NEXT: v_readfirstlane_b32 s96, v11 +; SI-NEXT: v_readfirstlane_b32 s97, v12 +; SI-NEXT: v_readfirstlane_b32 s98, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB17_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s4, 1 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: s_lshl_b32 s4, s16, 16 +; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_and_b32 s13, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s8, 16 +; SI-NEXT: s_and_b32 s15, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s7, 16 +; SI-NEXT: s_and_b32 s41, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s6, 16 +; SI-NEXT: s_and_b32 s43, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s99, 16 +; SI-NEXT: s_and_b32 s45, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s98, 16 +; SI-NEXT: s_and_b32 s47, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s97, 16 +; SI-NEXT: s_and_b32 s57, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s96, 16 +; SI-NEXT: s_and_b32 s59, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s87, 16 +; SI-NEXT: s_and_b32 s61, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s86, 16 +; SI-NEXT: s_and_b32 s63, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s85, 16 +; SI-NEXT: s_and_b32 s73, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s84, 16 +; SI-NEXT: s_and_b32 s75, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s83, 16 +; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s82, 16 +; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s81, 16 +; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s80, 16 +; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s71, 16 +; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s28, 16 +; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s27, 16 +; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s26, 16 +; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s25, 16 +; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s24, 16 +; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s23, 16 +; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s22, 16 +; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s21, 16 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s20, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s18, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 3 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB17_3 +; SI-NEXT: .LBB17_2: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB17_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s10, s12 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s14, s40 +; SI-NEXT: s_mov_b32 s15, s41 +; SI-NEXT: s_mov_b32 s40, s42 +; SI-NEXT: s_mov_b32 s41, s43 +; SI-NEXT: s_mov_b32 s42, s44 +; SI-NEXT: s_mov_b32 s43, s45 +; SI-NEXT: s_mov_b32 s44, s46 +; SI-NEXT: s_mov_b32 s45, s47 +; SI-NEXT: s_mov_b32 s46, s56 +; SI-NEXT: s_mov_b32 s47, s57 +; SI-NEXT: s_mov_b32 s56, s58 +; SI-NEXT: s_mov_b32 s57, s59 +; SI-NEXT: s_mov_b32 s58, s60 +; SI-NEXT: s_mov_b32 s59, s61 +; SI-NEXT: s_mov_b32 s60, s62 +; SI-NEXT: s_mov_b32 s61, s63 +; SI-NEXT: s_mov_b32 s62, s72 +; SI-NEXT: s_mov_b32 s63, s73 +; SI-NEXT: s_mov_b32 s72, s74 +; SI-NEXT: s_mov_b32 s73, s75 +; SI-NEXT: s_mov_b32 s74, s76 +; SI-NEXT: v_readlane_b32 s75, v21, 0 +; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_cbranch_vccnz .LBB17_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_add_i32 s81, s81, 3 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s86, s86, 3 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: s_add_i32 s99, s99, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s15, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s6, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s5, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s9, 16 +; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s8, 16 +; SI-NEXT: s_and_b32 s13, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s7, 16 +; SI-NEXT: s_and_b32 s41, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s99, 16 +; SI-NEXT: s_and_b32 s43, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s98, 16 +; SI-NEXT: s_and_b32 s45, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s97, 16 +; SI-NEXT: s_and_b32 s47, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s96, 16 +; SI-NEXT: s_and_b32 s57, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s87, 16 +; SI-NEXT: s_and_b32 s59, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s86, 16 +; SI-NEXT: s_and_b32 s61, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s85, 16 +; SI-NEXT: s_and_b32 s63, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s84, 16 +; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s83, 16 +; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s82, 16 +; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s81, 16 +; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s80, 16 +; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s71, 16 +; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s28, 16 +; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s27, 16 +; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s26, 16 +; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s25, 16 +; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s24, 16 +; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s23, 16 +; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s22, 16 +; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s21, 16 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s20, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s18, 16 +; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s6, 2 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: v_writelane_b32 v21, s6, 3 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_readlane_b32 s6, v21, 2 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v21, 3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB17_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: .LBB17_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -13370,570 +26113,3418 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v64bf16_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -13941,1629 +29532,6384 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB9_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 ; -; GFX9-LABEL: bitcast_v64bf16_to_v32i32: +; GFX9-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB9_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: s_branch .LBB19_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 ; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32: +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB20_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: .LBB20_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s35, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_lshr_b32 s34, s6, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s20, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_lshr_b32 s60, s22, 16 +; SI-NEXT: s_lshr_b32 s61, s23, 16 +; SI-NEXT: s_lshr_b32 s62, s24, 16 +; SI-NEXT: s_lshr_b32 s63, s25, 16 +; SI-NEXT: s_lshr_b32 s72, s26, 16 +; SI-NEXT: s_lshr_b32 s73, s27, 16 +; SI-NEXT: s_lshr_b32 s74, s28, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s47, 16 +; SI-NEXT: s_lshr_b32 s77, s46, 16 +; SI-NEXT: s_lshr_b32 s78, s45, 16 +; SI-NEXT: s_lshr_b32 s79, s44, 16 +; SI-NEXT: s_lshr_b32 s88, s43, 16 +; SI-NEXT: s_lshr_b32 s89, s42, 16 +; SI-NEXT: s_lshr_b32 s90, s41, 16 +; SI-NEXT: s_lshr_b32 s91, s40, 16 +; SI-NEXT: s_lshr_b32 s92, s15, 16 +; SI-NEXT: s_lshr_b32 s93, s14, 16 +; SI-NEXT: s_lshr_b32 s94, s13, 16 +; SI-NEXT: s_lshr_b32 s95, s12, 16 +; SI-NEXT: s_lshr_b32 vcc_lo, s11, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s10, 16 +; SI-NEXT: s_lshr_b32 s30, s8, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v9, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v11, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v45, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v43, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v41, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v55, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v53, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v48, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v38, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v36, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v34, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v32, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v30, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v32i32_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v64f16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB22_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB22_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB23_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB23_3 +; SI-NEXT: .LBB23_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB23_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB23_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB23_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) - %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x i32> br label %end cmp.false: - %a3 = bitcast <64 x bfloat> %a to <32 x i32> + %a3 = bitcast <64 x half> %a to <32 x i32> br label %end end: @@ -15571,780 +35917,400 @@ end: ret <32 x i32> %phi } -define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v63 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v49 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB10_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v31, v33 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: .LBB10_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v44 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v41 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v54 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v49 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v48 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v32i32_to_v64f16: +; VI-LABEL: bitcast_v32i32_to_v64i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -16354,7 +36320,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -16389,12 +36355,12 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v32i32_to_v64f16: +; GFX9-LABEL: bitcast_v32i32_to_v64i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -16404,7 +36370,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -16439,958 +36405,2659 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, s4, v16 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v32i32_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v64i16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB26_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB26_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_add_u16_e32 v32, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v32, v14 +; VI-NEXT: v_add_u16_e32 v32, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v32, v13 +; VI-NEXT: v_add_u16_e32 v32, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v32, v12 +; VI-NEXT: v_add_u16_e32 v32, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v11 +; VI-NEXT: v_add_u16_e32 v32, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v32, v10 +; VI-NEXT: v_add_u16_e32 v32, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v32, v9 +; VI-NEXT: v_add_u16_e32 v32, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v32, v8 +; VI-NEXT: v_add_u16_e32 v32, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v32, v7 +; VI-NEXT: v_add_u16_e32 v32, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v32, v6 +; VI-NEXT: v_add_u16_e32 v32, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v32, v5 +; VI-NEXT: v_add_u16_e32 v32, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_add_u16_e32 v32, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v32, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_add_u16_e32 v32, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v32, v1 +; VI-NEXT: v_add_u16_e32 v32, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: v_add_u16_e32 v32, 3, v30 +; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: v_add_u16_e32 v32, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v32i32_to_v64f16: +; GFX11-LABEL: bitcast_v64i16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s39, s6 +; VI-NEXT: s_or_b32 s7, s38, s7 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v15, s6 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB27_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <32 x i32> %a, splat (i32 3) - %a2 = bitcast <32 x i32> %a1 to <64 x half> + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> br label %end cmp.false: - %a3 = bitcast <32 x i32> %a to <64 x half> + %a3 = bitcast <64 x i16> %a to <32 x i32> br label %end end: - %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x half> %phi + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi } -define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB11_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB11_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v32f32_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v64f16_to_v32i32: +; VI-LABEL: bitcast_v32f32_to_v16i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v32, 0x200 -; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v33 -; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v33 -; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v33 -; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v33 -; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v33 -; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v33 -; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v33 -; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v33 -; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v33 -; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v33 -; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v33 -; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v33 -; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v33 -; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v33 -; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v33 -; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 -; VI-NEXT: v_or_b32_e32 v31, v31, v33 -; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 -; VI-NEXT: v_or_b32_e32 v30, v30, v33 -; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v33 -; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v33 -; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v33 -; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v33 -; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v33 -; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v33 -; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v33 -; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 -; VI-NEXT: v_or_b32_e32 v21, v21, v33 -; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 -; VI-NEXT: v_or_b32_e32 v20, v20, v33 -; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 -; VI-NEXT: v_or_b32_e32 v19, v19, v33 -; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 -; VI-NEXT: v_or_b32_e32 v18, v18, v33 -; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 -; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 -; VI-NEXT: v_or_b32_e32 v17, v17, v33 -; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB28_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v64f16_to_v32i32: +; GFX9-LABEL: bitcast_v32f32_to_v16i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -17400,48 +39067,47 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v64f16_to_v32i32: +; GFX11-LABEL: bitcast_v32f32_to_v16i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -17453,42 +39119,26 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17496,384 +39146,375 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <64 x half> %a, splat (half 0xH0200) - %a2 = bitcast <64 x half> %a1 to <32 x i32> + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> br label %end cmp.false: - %a3 = bitcast <64 x half> %a to <32 x i32> + %a3 = bitcast <32 x float> %a to <16 x i64> br label %end end: - %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <32 x i32> %phi + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi } -define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_or_b32_e32 v46, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v55 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v15, v15, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v45 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v19, v19, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v20, v20, v34 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v34 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; GCN-NEXT: v_or_b32_e32 v22, v22, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; GCN-NEXT: v_or_b32_e32 v25, v25, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v54 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; GCN-NEXT: v_or_b32_e32 v27, v27, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v32, v32, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: s_branch .LBB29_2 ; -; VI-LABEL: bitcast_v32i32_to_v64i16: +; VI-LABEL: bitcast_v32f32_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB29_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: .LBB29_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <32 x float> @bitcast_v16i64_to_v32f32(<16 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v16i64_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v32f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -17883,47 +39524,47 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v32i32_to_v64i16: +; GFX9-LABEL: bitcast_v16i64_to_v32f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -17933,47 +39574,47 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v32i32_to_v64i16: +; GFX11-LABEL: bitcast_v16i64_to_v32f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -17985,42 +39626,50 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -18028,912 +39677,399 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <32 x i32> %a, splat (i32 3) - %a2 = bitcast <32 x i32> %a1 to <64 x i16> + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> br label %end cmp.false: - %a3 = bitcast <32 x i32> %a to <64 x i16> + %a3 = bitcast <16 x i64> %a to <32 x float> br label %end end: - %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x i16> %phi + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi } -define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB13_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB13_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x float> @bitcast_v16i64_to_v32f32_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 ; -; VI-LABEL: bitcast_v64i16_to_v32i32: +; VI-LABEL: bitcast_v16i64_to_v32f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v33, 3 -; VI-NEXT: v_add_u16_e32 v32, 3, v15 -; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v32, v15 -; VI-NEXT: v_add_u16_e32 v32, 3, v14 -; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v32, v14 -; VI-NEXT: v_add_u16_e32 v32, 3, v13 -; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v32, v13 -; VI-NEXT: v_add_u16_e32 v32, 3, v12 -; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v32, v12 -; VI-NEXT: v_add_u16_e32 v32, 3, v11 -; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v32, v11 -; VI-NEXT: v_add_u16_e32 v32, 3, v10 -; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v32, v10 -; VI-NEXT: v_add_u16_e32 v32, 3, v9 -; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v32, v9 -; VI-NEXT: v_add_u16_e32 v32, 3, v8 -; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v32, v8 -; VI-NEXT: v_add_u16_e32 v32, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v32, v7 -; VI-NEXT: v_add_u16_e32 v32, 3, v6 -; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v32, v6 -; VI-NEXT: v_add_u16_e32 v32, 3, v5 -; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v32, v5 -; VI-NEXT: v_add_u16_e32 v32, 3, v4 -; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v32, v4 -; VI-NEXT: v_add_u16_e32 v32, 3, v3 -; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v32, v3 -; VI-NEXT: v_add_u16_e32 v32, 3, v2 -; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v32, v2 -; VI-NEXT: v_add_u16_e32 v32, 3, v1 -; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v32, v1 -; VI-NEXT: v_add_u16_e32 v32, 3, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v32, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v32, 3, v31 -; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: v_add_u16_e32 v32, 3, v30 -; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v30, v32, v30 -; VI-NEXT: v_add_u16_e32 v32, 3, v29 -; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v32, v29 -; VI-NEXT: v_add_u16_e32 v32, 3, v28 -; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v32, v28 -; VI-NEXT: v_add_u16_e32 v32, 3, v27 -; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_add_u16_e32 v32, 3, v26 -; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v32, v26 -; VI-NEXT: v_add_u16_e32 v32, 3, v25 -; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v32, v25 -; VI-NEXT: v_add_u16_e32 v32, 3, v24 -; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v32, v24 -; VI-NEXT: v_add_u16_e32 v32, 3, v23 -; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v32, v23 -; VI-NEXT: v_add_u16_e32 v32, 3, v22 -; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v32, v22 -; VI-NEXT: v_add_u16_e32 v32, 3, v21 -; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v32, v21 -; VI-NEXT: v_add_u16_e32 v32, 3, v20 -; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v32, v20 -; VI-NEXT: v_add_u16_e32 v32, 3, v19 -; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v32, v19 -; VI-NEXT: v_add_u16_e32 v32, 3, v18 -; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v32, v18 -; VI-NEXT: v_add_u16_e32 v32, 3, v17 -; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v32, v17 -; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 ; -; GFX9-LABEL: bitcast_v64i16_to_v32i32: +; GFX9-LABEL: bitcast_v16i64_to_v32f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 ; -; GFX11-LABEL: bitcast_v64i16_to_v32i32: +; GFX11-LABEL: bitcast_v16i64_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB31_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: .LBB31_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <64 x i16> %a, splat (i16 3) - %a2 = bitcast <64 x i16> %a1 to <32 x i32> + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> br label %end cmp.false: - %a3 = bitcast <64 x i16> %a to <32 x i32> + %a3 = bitcast <16 x i64> %a to <32 x float> br label %end end: - %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <32 x i32> %phi + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi } -define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v32f32_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v32f32_to_v16i64: +; VI-LABEL: bitcast_v32f32_to_v16f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -18943,7 +40079,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -18978,12 +40114,12 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v32f32_to_v16i64: +; GFX9-LABEL: bitcast_v32f32_to_v16f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -18993,7 +40129,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -19028,12 +40164,12 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v32f32_to_v16i64: +; GFX11-LABEL: bitcast_v32f32_to_v16f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -19045,7 +40181,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 @@ -19064,7 +40200,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -19073,309 +40209,139 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { cmp.true: %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <32 x float> %a1 to <16 x i64> - br label %end - -cmp.false: - %a3 = bitcast <32 x float> %a to <16 x i64> - br label %end - -end: - %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <16 x i64> %phi -} - -define <32 x float> @bitcast_v16i64_to_v32f32(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v16i64_to_v32f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB15_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v16i64_to_v32f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB15_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v16i64_to_v32f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <16 x i64> %a, splat (i64 3) - %a2 = bitcast <16 x i64> %a1 to <32 x float> + %a2 = bitcast <32 x float> %a1 to <16 x double> br label %end cmp.false: - %a3 = bitcast <16 x i64> %a to <32 x float> + %a3 = bitcast <32 x float> %a to <16 x double> br label %end end: - %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <32 x float> %phi + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi } -define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: s_branch .LBB33_2 ; -; VI-LABEL: bitcast_v32f32_to_v16f64: +; VI-LABEL: bitcast_v32f32_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 @@ -19389,7 +40355,7 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -19408,24 +40374,53 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: s_branch .LBB33_2 ; -; GFX9-LABEL: bitcast_v32f32_to_v16f64: +; GFX9-LABEL: bitcast_v32f32_to_v16f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 @@ -19439,7 +40434,7 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -19458,26 +40453,44 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: s_branch .LBB33_2 ; -; GFX11-LABEL: bitcast_v32f32_to_v16f64: +; GFX11-LABEL: bitcast_v32f32_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB33_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: .LBB33_4: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -19494,9 +40507,6 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB16_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19516,39 +40526,39 @@ end: } define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v32f32: ; VI: ; %bb.0: @@ -19560,7 +40570,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -19579,7 +40589,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -19594,7 +40604,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -19613,7 +40623,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -19630,7 +40640,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -19649,7 +40659,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -19670,1228 +40680,1478 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v16f64_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB35_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_f32_e32 v58, 1.0, v58 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v59, 1.0, v59 -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 -; GCN-NEXT: v_or_b32_e32 v1, v1, v52 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v50, v31 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 -; GCN-NEXT: v_or_b32_e32 v49, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v57 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v29, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v1, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v5, v1, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v36 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v44 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v45 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v56 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v58 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v59 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v60 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB36_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v128i8: ; VI: ; %bb.0: @@ -21092,7 +42352,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -21268,9 +42528,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: .LBB18_2: ; %Flow +; VI-NEXT: .LBB36_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_4 +; VI-NEXT: s_cbranch_execz .LBB36_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -21477,7 +42737,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: .LBB18_4: ; %end +; VI-NEXT: .LBB36_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -22069,7 +43329,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -22263,9 +43523,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: .LBB36_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 @@ -22491,7 +43751,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: .LBB36_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -22939,7 +44199,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -23006,9 +44266,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB36_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 @@ -23091,7 +44351,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_4: ; %end +; GFX11-TRUE16-NEXT: .LBB36_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -23540,7 +44800,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -23639,9 +44899,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB18_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB36_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 @@ -23756,7 +45016,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB18_4: ; %end +; GFX11-FAKE16-NEXT: .LBB36_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -24098,1621 +45358,7429 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v56, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v47, s17 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v42, s19 +; SI-NEXT: v_mov_b32_e32 v40, s20 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v51, s22 +; SI-NEXT: v_mov_b32_e32 v48, s23 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v35, s25 +; SI-NEXT: v_mov_b32_e32 v33, s26 +; SI-NEXT: v_mov_b32_e32 v30, s27 +; SI-NEXT: v_mov_b32_e32 v28, s28 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 +; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 +; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 +; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 +; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 +; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 +; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 +; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 +; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_add_f32_e32 v47, 1.0, v47 +; SI-NEXT: v_add_f32_e32 v56, 1.0, v56 +; SI-NEXT: v_add_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v44, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v40, 1.0, v40 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 +; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 +; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 +; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 +; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 +; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 +; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 +; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_and_b32_e32 v54, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v41, v56, v41 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 +; SI-NEXT: v_or_b32_e32 v50, v50, v54 +; SI-NEXT: v_and_b32_e32 v54, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v50, v54, v50 +; SI-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v21, v50, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v32 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v63 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v61 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v60 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v58 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v45 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v38 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v52 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v34 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v28 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v31 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v32f32_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: s_lshr_b32 s80, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s85, s23, 8 +; VI-NEXT: s_lshr_b32 s84, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s52, s21, 8 +; VI-NEXT: s_lshr_b32 s51, s20, 16 +; VI-NEXT: s_lshr_b32 s53, s20, 8 +; VI-NEXT: s_lshr_b32 s54, s19, 24 +; VI-NEXT: s_lshr_b32 s55, s19, 16 +; VI-NEXT: s_lshr_b32 s65, s19, 8 +; VI-NEXT: s_lshr_b32 s64, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s67, s17, 24 +; VI-NEXT: s_lshr_b32 s68, s17, 16 +; VI-NEXT: s_lshr_b32 s70, s17, 8 +; VI-NEXT: s_lshr_b32 s69, s16, 16 +; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: v_add_f32_e64 v4, s7, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s6, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: v_add_f32_e64 v6, s9, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s8, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: v_add_f32_e64 v8, s11, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s10, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: v_add_f32_e64 v10, s13, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s12, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_add_f32_e64 v12, s15, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s14, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: v_add_f32_e64 v14, s41, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s40, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_add_f32_e64 v16, s43, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s42, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: v_add_f32_e64 v18, s45, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s44, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_add_f32_e64 v20, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v19, s28, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v21, s26, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_add_f32_e64 v28, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v27, s20, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_add_f32_e64 v30, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v29, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v24, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v23, s24, 1.0 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_add_f32_e64 v32, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v31, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v26, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v25, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v31 +; VI-NEXT: s_branch .LBB37_5 +; VI-NEXT: .LBB37_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 13 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v17, s44 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v13, s40 +; VI-NEXT: v_mov_b32_e32 v14, s41 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v6, s9 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v35, s71 +; VI-NEXT: v_mov_b32_e32 v61, s69 +; VI-NEXT: v_mov_b32_e32 v34, s70 +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: v_mov_b32_e32 v60, s67 +; VI-NEXT: v_mov_b32_e32 v52, s66 +; VI-NEXT: v_mov_b32_e32 v59, s64 +; VI-NEXT: v_mov_b32_e32 v58, s65 +; VI-NEXT: v_mov_b32_e32 v57, s55 +; VI-NEXT: v_mov_b32_e32 v49, s54 +; VI-NEXT: v_mov_b32_e32 v47, s53 +; VI-NEXT: v_mov_b32_e32 v56, s51 +; VI-NEXT: v_mov_b32_e32 v38, s52 +; VI-NEXT: v_mov_b32_e32 v51, s50 +; VI-NEXT: v_mov_b32_e32 v46, s87 +; VI-NEXT: v_mov_b32_e32 v44, s86 +; VI-NEXT: v_mov_b32_e32 v45, s84 +; VI-NEXT: v_mov_b32_e32 v43, s85 +; VI-NEXT: v_mov_b32_e32 v55, s83 +; VI-NEXT: v_mov_b32_e32 v42, s82 +; VI-NEXT: v_mov_b32_e32 v37, s81 +; VI-NEXT: v_mov_b32_e32 v50, s80 +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: v_mov_b32_e32 v39, s36 +; VI-NEXT: v_mov_b32_e32 v40, s38 +; VI-NEXT: v_mov_b32_e32 v41, s48 +; VI-NEXT: .LBB37_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v41 +; VI-NEXT: v_or_b32_sdwa v31, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v61, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v60 +; VI-NEXT: v_or_b32_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v40 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v52 +; VI-NEXT: v_or_b32_sdwa v31, v59, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v31, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v58 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v49 +; VI-NEXT: v_or_b32_sdwa v30, v57, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v47 +; VI-NEXT: v_or_b32_sdwa v29, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v29, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v38 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v46 +; VI-NEXT: v_or_b32_sdwa v28, v51, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v54 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v44 +; VI-NEXT: v_or_b32_sdwa v27, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v27, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v43 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v42 +; VI-NEXT: v_or_b32_sdwa v26, v55, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v37 +; VI-NEXT: v_or_b32_sdwa v25, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v21, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v17, v17, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: v_readfirstlane_b32 s42, v3 +; GFX9-NEXT: v_readfirstlane_b32 s43, v4 +; GFX9-NEXT: v_readfirstlane_b32 s40, v5 +; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s82, s27, 24 +; GFX9-NEXT: s_lshr_b32 s83, s27, 16 +; GFX9-NEXT: s_lshr_b32 s85, s27, 8 +; GFX9-NEXT: s_lshr_b32 s84, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s98, s25, 8 +; GFX9-NEXT: s_lshr_b32 s97, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s49, s23, 8 +; GFX9-NEXT: s_lshr_b32 s48, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s54, s21, 8 +; GFX9-NEXT: s_lshr_b32 s53, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s67, s19, 8 +; GFX9-NEXT: s_lshr_b32 s66, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s80, s17, 8 +; GFX9-NEXT: s_lshr_b32 s71, s16, 16 +; GFX9-NEXT: s_lshr_b32 s81, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: v_add_f32_e64 v14, s41, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s40, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: v_add_f32_e64 v23, s43, 1.0 +; GFX9-NEXT: v_add_f32_e64 v22, s42, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[22:23] +; GFX9-NEXT: v_add_f32_e64 v25, s45, 1.0 +; GFX9-NEXT: v_add_f32_e64 v24, s44, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[24:25] +; GFX9-NEXT: v_add_f32_e64 v27, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v26, s28, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[26:27] +; GFX9-NEXT: v_add_f32_e64 v29, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v28, s26, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[28:29] +; GFX9-NEXT: v_add_f32_e64 v31, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v30, s24, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[30:31] +; GFX9-NEXT: v_add_f32_e64 v33, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v32, s22, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[32:33] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v29 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: v_add_f32_e64 v35, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v34, s20, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; GFX9-NEXT: v_add_f32_e64 v37, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v36, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[34:35] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: v_add_f32_e64 v39, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v38, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[36:37] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[38:39] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v38 +; GFX9-NEXT: s_branch .LBB37_5 +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v52, s48 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s39 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s38 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s97 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s96 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s87 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s84 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s83 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s82 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, s16 +; GFX9-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s24 +; GFX9-NEXT: v_mov_b32_e32 v31, s25 +; GFX9-NEXT: v_mov_b32_e32 v28, s26 +; GFX9-NEXT: v_mov_b32_e32 v29, s27 +; GFX9-NEXT: v_mov_b32_e32 v26, s28 +; GFX9-NEXT: v_mov_b32_e32 v27, s29 +; GFX9-NEXT: v_mov_b32_e32 v24, s44 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v22, s42 +; GFX9-NEXT: v_mov_b32_e32 v23, s43 +; GFX9-NEXT: v_mov_b32_e32 v13, s40 +; GFX9-NEXT: v_mov_b32_e32 v14, s41 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v48, s81 +; GFX9-NEXT: v_mov_b32_e32 v21, s71 +; GFX9-NEXT: v_mov_b32_e32 v16, s80 +; GFX9-NEXT: v_mov_b32_e32 v19, s70 +; GFX9-NEXT: v_mov_b32_e32 v20, s69 +; GFX9-NEXT: v_mov_b32_e32 v15, s68 +; GFX9-NEXT: v_mov_b32_e32 v18, s66 +; GFX9-NEXT: v_mov_b32_e32 v61, s67 +; GFX9-NEXT: v_mov_b32_e32 v51, s65 +; GFX9-NEXT: v_mov_b32_e32 v17, s64 +; GFX9-NEXT: v_mov_b32_e32 v54, s55 +; GFX9-NEXT: v_mov_b32_e32 v50, s53 +; GFX9-NEXT: v_mov_b32_e32 v60, s54 +; GFX9-NEXT: v_mov_b32_e32 v49, s51 +; GFX9-NEXT: v_mov_b32_e32 v59, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s49 +; GFX9-NEXT: v_mov_b32_e32 v57, s99 +; GFX9-NEXT: v_mov_b32_e32 v53, s98 +; GFX9-NEXT: v_mov_b32_e32 v56, s86 +; GFX9-NEXT: v_mov_b32_e32 v47, s85 +; GFX9-NEXT: v_mov_b32_e32 v40, s30 +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: v_mov_b32_e32 v42, s36 +; GFX9-NEXT: .LBB37_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v34, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v33, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v16, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v46 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v43 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v15, v51, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v42 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v34, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v30, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32f32_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s17, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-TRUE16-NEXT: .LBB37_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v37, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v36, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v39, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v38, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s5, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s4, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v51, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v50, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, s15, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s14, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s9, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s8, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[30:31] +; GFX11-TRUE16-NEXT: v_add_f32_e64 v55, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v54, s0, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, s29, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, s28, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, s41, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v21, s40, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s7, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s11, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s13, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s12, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s10, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s6, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[36:37] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[38:39] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[28:29] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 24, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 24, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v54 +; GFX11-TRUE16-NEXT: s_branch .LBB37_5 +; GFX11-TRUE16-NEXT: .LBB37_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB37_2 +; GFX11-TRUE16-NEXT: .LBB37_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 13 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 15 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 17 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s42 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s0 +; GFX11-TRUE16-NEXT: .LBB37_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xff, v72 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v59, 0xff, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, v71, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v60 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v59, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xff, v62 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v56, 8, v56 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v71, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, v57, v56 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v50, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v46, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v39, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v41 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v38, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v68, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v183 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v182 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v67, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xff, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v68, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v50, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v150 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v68, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v160 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v29, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v51, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v55, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v30, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v23, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v48, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v129 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v48, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v51, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v22, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v17, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v18, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v32, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v48, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v32, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v165 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v164 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v25, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v70, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v25, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v67, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v20, v16 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[28:31], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v74, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s25, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s22, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s19, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-FAKE16-NEXT: .LBB37_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v35, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v37, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v36, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_add_f32_e64 v53, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, s0, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v49, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v48, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, s29, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s28, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s41, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s40, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s15, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s14, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s5, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s7, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s11, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s13, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s12, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s10, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s6, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s4, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[30:31] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-FAKE16-NEXT: s_branch .LBB37_5 +; GFX11-FAKE16-NEXT: .LBB37_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB37_2 +; GFX11-FAKE16-NEXT: .LBB37_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v147, s36 :: v_dual_mov_b32 v48, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s3 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s17 :: v_dual_mov_b32 v148, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s20 :: v_dual_mov_b32 v31, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v146, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v145, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v18, s29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v144, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v134, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s4 :: v_dual_mov_b32 v10, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s6 :: v_dual_mov_b32 v8, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v135, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v133, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v2, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v74, s35 :: v_dual_mov_b32 v73, s104 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v132, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v72, s34 :: v_dual_mov_b32 v63, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v62, s102 :: v_dual_mov_b32 v61, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v131, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v60, s99 :: v_dual_mov_b32 v59, s100 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v58, s98 :: v_dual_mov_b32 v57, s97 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v129, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v56, s96 :: v_dual_mov_b32 v47, s86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s87 :: v_dual_mov_b32 v45, s85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v130, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v44, s84 :: v_dual_mov_b32 v43, s83 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v42, s81 :: v_dual_mov_b32 v41, s82 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v128, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, s80 :: v_dual_mov_b32 v183, s71 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, s70 :: v_dual_mov_b32 v181, s68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v119, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, s69 :: v_dual_mov_b32 v179, s67 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, s66 :: v_dual_mov_b32 v177, s65 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v118, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, s55 :: v_dual_mov_b32 v167, s64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v166, s54 :: v_dual_mov_b32 v165, s53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v116, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v164, s52 :: v_dual_mov_b32 v163, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v162, s51 :: v_dual_mov_b32 v161, s49 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v117, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v160, s48 :: v_dual_mov_b32 v151, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v150, s37 :: v_dual_mov_b32 v149, s38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v115, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s62 :: v_dual_mov_b32 v38, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s72 :: v_dual_mov_b32 v50, s90 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s74 :: v_dual_mov_b32 v54, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s94 :: v_dual_mov_b32 v65, s30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v113, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s60 :: v_dual_mov_b32 v67, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s56 :: v_dual_mov_b32 v69, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v103, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v70, s44 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v80, s42 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v26, s76 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v32, s78 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v112, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v102, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v101, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v100, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v98, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v99, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v97, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v96, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v87, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 29 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v85, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v86, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v84, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v83, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v51, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v39, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v33, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, s0 +; GFX11-FAKE16-NEXT: .LBB37_5: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, v60, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v60, v52, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, v80, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v48, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v49, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v61, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v80, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v62, v48, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v71, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v63, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v69, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v36, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v52, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v69, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v36, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v52, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v68, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v53, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v49, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v67, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v69, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v30, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v66, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[60:63], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[34:37], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v161 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v67, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v35, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v64, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v49, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v23, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v36, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v17, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v52, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v35, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v49, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v52, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v13, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v14, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v32, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v49, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v32, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v25, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v25, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v16 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[28:31], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[34:37], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB19_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB19_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB38_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB38_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32f32: ; VI: ; %bb.0: @@ -26044,7 +53112,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -26517,9 +53585,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -26908,7 +53976,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: .LBB38_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -27280,7 +54348,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -27754,9 +54822,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: .LBB38_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cbranch_execz .LBB38_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -28151,7 +55219,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: .LBB38_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -28393,15 +55461,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_4 -; GFX11-TRUE16-NEXT: .LBB19_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4 +; GFX11-TRUE16-NEXT: .LBB38_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB19_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h @@ -28558,68 +55626,554 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 @@ -28630,1537 +56184,11002 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-TRUE16-NEXT: .LBB19_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: .LBB38_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: .LBB38_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB39_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB39_3 +; SI-NEXT: .LBB39_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB39_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB39_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB39_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB39_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v58, v2 +; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB39_3 +; VI-NEXT: .LBB39_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB39_3: ; %Flow +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB39_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_or_b32_e32 v30, v30, v31 +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 +; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: .LBB39_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB39_3 +; GFX9-NEXT: .LBB39_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB39_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB39_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB39_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_3 +; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB39_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB39_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB39_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_3 +; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB39_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB39_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB39_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v32f32_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v32, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v63 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s40, v11 +; SI-NEXT: v_readfirstlane_b32 s41, v12 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v16 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s47, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshl_b32 s4, s47, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshl_b32 s4, s46, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: s_and_b32 s60, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s45, 16 +; SI-NEXT: s_and_b32 s62, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s44, 16 +; SI-NEXT: s_and_b32 s72, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s43, 16 +; SI-NEXT: s_and_b32 s74, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s42, 16 +; SI-NEXT: s_and_b32 s76, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s41, 16 +; SI-NEXT: s_and_b32 s78, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s40, 16 +; SI-NEXT: s_and_b32 s88, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s15, 16 +; SI-NEXT: s_and_b32 s90, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s14, 16 +; SI-NEXT: s_and_b32 s92, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s13, 16 +; SI-NEXT: s_and_b32 s94, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s12, 16 +; SI-NEXT: s_and_b32 s30, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s11, 16 +; SI-NEXT: s_and_b32 s34, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s10, 16 +; SI-NEXT: s_and_b32 s36, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s9, 16 +; SI-NEXT: s_and_b32 s38, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s8, 16 +; SI-NEXT: s_and_b32 s48, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s7, 16 +; SI-NEXT: s_and_b32 s50, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s6, 16 +; SI-NEXT: s_and_b32 s52, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s29, 16 +; SI-NEXT: s_and_b32 s54, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s28, 16 +; SI-NEXT: s_and_b32 s64, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s27, 16 +; SI-NEXT: s_and_b32 s66, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s26, 16 +; SI-NEXT: s_and_b32 s68, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s25, 16 +; SI-NEXT: s_and_b32 s70, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s24, 16 +; SI-NEXT: s_and_b32 s80, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s23, 16 +; SI-NEXT: s_and_b32 s82, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s83, s22, 16 +; SI-NEXT: s_and_b32 s84, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s21, 16 +; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s20, 16 +; SI-NEXT: s_and_b32 s96, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s19, 16 +; SI-NEXT: s_and_b32 s98, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s18, 16 +; SI-NEXT: s_and_b32 s56, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s17, 16 +; SI-NEXT: s_and_b32 s58, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s47, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s46, 1.0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v43, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v41, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v55, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v53, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v51, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v39, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s45, 1.0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_add_f32_e64 v2, s16, 1.0 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: v_mov_b32_e32 v2, s59 +; SI-NEXT: v_mov_b32_e32 v3, s58 +; SI-NEXT: v_mov_b32_e32 v61, s57 +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_mov_b32_e32 v59, s99 +; SI-NEXT: v_mov_b32_e32 v60, s98 +; SI-NEXT: v_mov_b32_e32 v57, s97 +; SI-NEXT: v_mov_b32_e32 v58, s96 +; SI-NEXT: v_mov_b32_e32 v47, s87 +; SI-NEXT: v_mov_b32_e32 v56, s86 +; SI-NEXT: v_mov_b32_e32 v45, s85 +; SI-NEXT: v_mov_b32_e32 v46, s84 +; SI-NEXT: v_mov_b32_e32 v43, s83 +; SI-NEXT: v_mov_b32_e32 v44, s82 +; SI-NEXT: v_mov_b32_e32 v41, s81 +; SI-NEXT: v_mov_b32_e32 v42, s80 +; SI-NEXT: v_mov_b32_e32 v55, s71 +; SI-NEXT: v_mov_b32_e32 v40, s70 +; SI-NEXT: v_mov_b32_e32 v53, s69 +; SI-NEXT: v_mov_b32_e32 v54, s68 +; SI-NEXT: v_mov_b32_e32 v51, s67 +; SI-NEXT: v_mov_b32_e32 v52, s66 +; SI-NEXT: v_mov_b32_e32 v49, s65 +; SI-NEXT: v_mov_b32_e32 v50, s64 +; SI-NEXT: v_mov_b32_e32 v39, s55 +; SI-NEXT: v_mov_b32_e32 v48, s54 +; SI-NEXT: v_mov_b32_e32 v37, s53 +; SI-NEXT: v_mov_b32_e32 v38, s52 +; SI-NEXT: v_mov_b32_e32 v35, s51 +; SI-NEXT: v_mov_b32_e32 v36, s50 +; SI-NEXT: v_mov_b32_e32 v33, s49 +; SI-NEXT: v_mov_b32_e32 v34, s48 +; SI-NEXT: v_mov_b32_e32 v31, s39 +; SI-NEXT: v_mov_b32_e32 v32, s38 +; SI-NEXT: v_mov_b32_e32 v29, s37 +; SI-NEXT: v_mov_b32_e32 v30, s36 +; SI-NEXT: v_mov_b32_e32 v27, s35 +; SI-NEXT: v_mov_b32_e32 v28, s34 +; SI-NEXT: v_mov_b32_e32 v25, s31 +; SI-NEXT: v_mov_b32_e32 v26, s30 +; SI-NEXT: v_mov_b32_e32 v23, s95 +; SI-NEXT: v_mov_b32_e32 v24, s94 +; SI-NEXT: v_mov_b32_e32 v21, s93 +; SI-NEXT: v_mov_b32_e32 v22, s92 +; SI-NEXT: v_mov_b32_e32 v19, s91 +; SI-NEXT: v_mov_b32_e32 v20, s90 +; SI-NEXT: v_mov_b32_e32 v17, s89 +; SI-NEXT: v_mov_b32_e32 v18, s88 +; SI-NEXT: v_mov_b32_e32 v15, s79 +; SI-NEXT: v_mov_b32_e32 v16, s78 +; SI-NEXT: v_mov_b32_e32 v13, s77 +; SI-NEXT: v_mov_b32_e32 v14, s76 +; SI-NEXT: v_mov_b32_e32 v11, s75 +; SI-NEXT: v_mov_b32_e32 v12, s74 +; SI-NEXT: v_mov_b32_e32 v9, s73 +; SI-NEXT: v_mov_b32_e32 v10, s72 +; SI-NEXT: v_mov_b32_e32 v7, s63 +; SI-NEXT: v_mov_b32_e32 v8, s62 +; SI-NEXT: v_mov_b32_e32 v5, s61 +; SI-NEXT: v_mov_b32_e32 v6, s60 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB41_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: .LBB41_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB42_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB42_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB19_2: ; %Flow +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB19_4: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <32 x float> + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x float> br label %end cmp.false: - %a3 = bitcast <128 x i8> %a to <32 x float> + %a3 = bitcast <64 x bfloat> %a to <32 x float> br label %end end: @@ -30168,2144 +67187,1217 @@ end: ret <32 x float> %phi } -define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v62 -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v32f32_to_v64bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: .LBB20_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v32f32_to_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: .LBB20_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32f32_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: .LBB20_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <32 x float> %a1 to <64 x bfloat> - br label %end - -cmp.false: - %a3 = bitcast <32 x float> %a to <64 x bfloat> - br label %end - -end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi -} - -define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 ; -; VI-LABEL: bitcast_v64bf16_to_v32f32: +; VI-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -32313,1619 +68405,2398 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB21_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 ; -; GFX9-LABEL: bitcast_v64bf16_to_v32f32: +; GFX9-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB21_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: s_branch .LBB43_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -33944,777 +70815,809 @@ end: } define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v63 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v49 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v31, v33 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v63 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v44 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v41 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v54 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v49 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v48 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_f32_e32 v42, 1.0, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64f16: ; VI: ; %bb.0: @@ -34726,7 +71629,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -34761,7 +71664,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -34776,7 +71679,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -34811,7 +71714,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -34828,7 +71731,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -34847,7 +71750,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -34868,768 +71771,1618 @@ end: ret <64 x half> %phi } +define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v41, s6, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s46, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s42, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 +; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 +; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_add_f32_e64 v34, s11, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v37 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB45_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: .LBB45_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v32f32: ; VI: ; %bb.0: @@ -35641,7 +73394,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -35741,7 +73494,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -35756,7 +73509,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -35792,7 +73545,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -35809,7 +73562,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -35844,7 +73597,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -35865,369 +73618,1543 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB47_3 +; SI-NEXT: .LBB47_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB47_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB47_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB47_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_or_b32_e32 v46, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v55 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v15, v15, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v45 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v19, v19, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v20, v20, v34 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v34 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; GCN-NEXT: v_or_b32_e32 v22, v22, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; GCN-NEXT: v_or_b32_e32 v25, v25, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v54 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; GCN-NEXT: v_or_b32_e32 v27, v27, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v32, v32, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64i16: ; VI: ; %bb.0: @@ -36239,7 +75166,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -36274,7 +75201,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -36289,7 +75216,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -36324,7 +75251,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -36341,7 +75268,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -36360,7 +75287,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -36381,613 +75308,1194 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_mov_b32_e32 v36, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v33, s18 +; SI-NEXT: v_mov_b32_e32 v32, s19 +; SI-NEXT: v_mov_b32_e32 v31, s20 +; SI-NEXT: v_mov_b32_e32 v29, s21 +; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v25, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v52, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 +; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v52, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 +; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v36, v36, v60 +; SI-NEXT: v_or_b32_e32 v23, v35, v23 +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v23, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 +; SI-NEXT: v_or_b32_e32 v23, v23, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v23, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v32f32_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB49_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: .LBB49_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v32f32: ; VI: ; %bb.0: @@ -36999,7 +76507,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v33, 3 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 @@ -37099,7 +76607,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -37114,7 +76622,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -37149,7 +76657,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37166,7 +76674,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -37201,7 +76709,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -37222,56 +76730,1108 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s39, s6 +; VI-NEXT: s_or_b32 s7, s38, s7 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v15, s6 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB51_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v16f64: ; VI: ; %bb.0: @@ -37283,7 +77843,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -37318,7 +77878,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -37333,7 +77893,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -37368,7 +77928,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37385,7 +77945,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -37428,7 +77988,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -37449,40 +78009,368 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v16i64_to_v16f64_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v16i64_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB53_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: .LBB53_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v16i64: ; VI: ; %bb.0: @@ -37494,7 +78382,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -37513,7 +78401,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -37528,7 +78416,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -37547,7 +78435,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37564,7 +78452,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -37583,7 +78471,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -37604,1227 +78492,1477 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v16f64_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB55_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_addc_u32_e32 v57, vcc, 0, v57, vcc -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 -; GCN-NEXT: v_or_b32_e32 v1, v1, v52 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v50, v31 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 -; GCN-NEXT: v_or_b32_e32 v49, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v58 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v29, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v1, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v5, v1, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v36 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v44 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v45 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v56 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v58 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v59 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v60 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: @@ -39025,7 +80163,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -39201,9 +80339,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: .LBB28_2: ; %Flow +; VI-NEXT: .LBB56_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_4 +; VI-NEXT: s_cbranch_execz .LBB56_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -39410,7 +80548,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: .LBB28_4: ; %end +; VI-NEXT: .LBB56_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -40002,7 +81140,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -40196,9 +81334,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB28_2: ; %Flow +; GFX9-NEXT: .LBB56_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_4 +; GFX9-NEXT: s_cbranch_execz .LBB56_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -40423,7 +81561,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: .LBB28_4: ; %end +; GFX9-NEXT: .LBB56_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -40871,7 +82009,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -40938,9 +82076,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB28_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB56_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -41048,7 +82186,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB28_4: ; %end +; GFX11-TRUE16-NEXT: .LBB56_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -41497,7 +82635,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -41596,9 +82734,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB56_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -41738,7 +82876,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB28_4: ; %end +; GFX11-FAKE16-NEXT: .LBB56_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -42080,1621 +83218,6415 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s47, s47, 3 +; SI-NEXT: s_addc_u32 s46, s46, 0 +; SI-NEXT: s_add_u32 s45, s45, 3 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_or_b32_e32 v40, s4, v40 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v54 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v54, v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; SI-NEXT: v_or_b32_e32 v53, s4, v53 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s67, 0xff +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s66, 24 +; SI-NEXT: v_or_b32_e32 v54, v40, v54 +; SI-NEXT: v_and_b32_e32 v53, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v54, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v53, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v55, v54, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v52, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; SI-NEXT: v_or_b32_e32 v50, s4, v50 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s55, 24 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v48, v48, v49 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v48, v50, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v52, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; SI-NEXT: v_or_b32_e32 v39, s4, v39 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s53, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v37 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s52, 24 +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v48, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v39, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v49, v48, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_or_b32_e32 v35, s4, v35 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s49, 24 +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v35, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v29, s4, v29 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s38, 24 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v23, s4, v23 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s35, 24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s31, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s30, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s93, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s91, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s90, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s79, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s76, 24 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s73, 24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s62, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s56, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s45, s45, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s14, s14, 3 +; VI-NEXT: s_addc_u32 s15, s15, 0 +; VI-NEXT: s_add_u32 s12, s12, 3 +; VI-NEXT: s_addc_u32 s13, s13, 0 +; VI-NEXT: s_add_u32 s10, s10, 3 +; VI-NEXT: s_addc_u32 s11, s11, 0 +; VI-NEXT: s_add_u32 s8, s8, 3 +; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s4, s4, 3 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_lshl_b32 s61, s65, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: s_lshl_b32 s61, s48, 8 +; VI-NEXT: s_and_b32 s63, s64, 0xff +; VI-NEXT: s_or_b32 s61, s63, s61 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s61, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s55, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s59, 0xff +; VI-NEXT: s_lshl_b32 s58, s58, 8 +; VI-NEXT: s_or_b32 s17, s17, s58 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_lshl_b32 s16, s54, 8 +; VI-NEXT: s_and_b32 s17, s18, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s38, 8 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s57, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: s_lshl_b32 s16, s56, 8 +; VI-NEXT: s_and_b32 s17, s20, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s36, 8 +; VI-NEXT: s_and_b32 s18, s47, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xff +; VI-NEXT: s_lshl_b32 s17, s46, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s50, 0xff +; VI-NEXT: s_lshl_b32 s18, s87, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v6, s16 +; VI-NEXT: s_lshl_b32 s16, s86, 8 +; VI-NEXT: s_and_b32 s17, s22, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s34, 8 +; VI-NEXT: s_and_b32 s18, s85, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xff +; VI-NEXT: s_lshl_b32 s17, s84, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s83, 0xff +; VI-NEXT: s_lshl_b32 s18, s82, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: s_lshl_b32 s16, s81, 8 +; VI-NEXT: s_and_b32 s17, s24, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s30, 8 +; VI-NEXT: s_and_b32 s18, s80, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xff +; VI-NEXT: s_lshl_b32 s17, s71, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s70, 0xff +; VI-NEXT: s_lshl_b32 s18, s69, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: s_lshl_b32 s16, s68, 8 +; VI-NEXT: s_and_b32 s17, s26, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s90, 8 +; VI-NEXT: s_and_b32 s18, s67, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: s_and_b32 s16, s27, 0xff +; VI-NEXT: s_lshl_b32 s17, s66, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 59 +; VI-NEXT: v_readlane_b32 s18, v21, 58 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 57 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s28, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 56 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s88, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 55 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: s_and_b32 s16, s29, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 54 +; VI-NEXT: v_readlane_b32 s18, v21, 53 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 52 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s44, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 51 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s78, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 50 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: s_and_b32 s16, s45, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 49 +; VI-NEXT: v_readlane_b32 s18, v21, 48 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 47 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s42, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 46 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s76, 8 +; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 45 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s43, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 44 +; VI-NEXT: v_readlane_b32 s18, v21, 43 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 42 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s40, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 41 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s74, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 40 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 39 +; VI-NEXT: v_readlane_b32 s18, v21, 38 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 37 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 36 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: v_readlane_b32 s15, v21, 35 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: v_readlane_b32 s15, v21, 34 +; VI-NEXT: v_readlane_b32 s16, v21, 33 +; VI-NEXT: s_and_b32 s15, s15, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 32 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 31 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: v_readlane_b32 s13, v21, 30 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: v_readlane_b32 s13, v21, 29 +; VI-NEXT: v_readlane_b32 s14, v21, 28 +; VI-NEXT: s_and_b32 s13, s13, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 27 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 26 +; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s13, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: v_readlane_b32 s11, v21, 25 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: v_readlane_b32 s11, v21, 24 +; VI-NEXT: v_readlane_b32 s12, v21, 23 +; VI-NEXT: s_and_b32 s11, s11, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 22 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 21 +; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff +; VI-NEXT: v_readlane_b32 s9, v21, 20 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: v_readlane_b32 s9, v21, 19 +; VI-NEXT: v_readlane_b32 s10, v21, 18 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 17 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 16 +; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: v_readlane_b32 s7, v21, 15 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 14 +; VI-NEXT: v_readlane_b32 s8, v21, 13 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 12 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 11 +; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: v_readlane_b32 s5, v21, 10 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s5, v21, 9 +; VI-NEXT: v_readlane_b32 s6, v21, 8 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s15, v21, 1 +; VI-NEXT: v_readlane_b32 s13, v21, 3 +; VI-NEXT: v_readlane_b32 s11, v21, 5 +; VI-NEXT: v_readlane_b32 s9, v21, 7 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v20, s34, 2 +; GFX9-NEXT: v_writelane_b32 v20, s35, 3 +; GFX9-NEXT: v_writelane_b32 v20, s36, 4 +; GFX9-NEXT: v_writelane_b32 v20, s37, 5 +; GFX9-NEXT: v_writelane_b32 v20, s38, 6 +; GFX9-NEXT: v_writelane_b32 v20, s39, 7 +; GFX9-NEXT: v_writelane_b32 v20, s48, 8 +; GFX9-NEXT: v_writelane_b32 v20, s49, 9 +; GFX9-NEXT: v_writelane_b32 v20, s50, 10 +; GFX9-NEXT: v_writelane_b32 v20, s51, 11 +; GFX9-NEXT: v_writelane_b32 v20, s52, 12 +; GFX9-NEXT: v_writelane_b32 v20, s53, 13 +; GFX9-NEXT: v_writelane_b32 v20, s54, 14 +; GFX9-NEXT: v_writelane_b32 v20, s55, 15 +; GFX9-NEXT: v_writelane_b32 v20, s64, 16 +; GFX9-NEXT: v_writelane_b32 v20, s65, 17 +; GFX9-NEXT: v_writelane_b32 v20, s66, 18 +; GFX9-NEXT: v_writelane_b32 v20, s67, 19 +; GFX9-NEXT: v_writelane_b32 v20, s68, 20 +; GFX9-NEXT: v_writelane_b32 v20, s69, 21 +; GFX9-NEXT: v_writelane_b32 v20, s70, 22 +; GFX9-NEXT: v_writelane_b32 v20, s71, 23 +; GFX9-NEXT: v_writelane_b32 v20, s80, 24 +; GFX9-NEXT: v_writelane_b32 v20, s81, 25 +; GFX9-NEXT: v_writelane_b32 v20, s82, 26 +; GFX9-NEXT: v_writelane_b32 v20, s83, 27 +; GFX9-NEXT: v_writelane_b32 v20, s84, 28 +; GFX9-NEXT: v_writelane_b32 v20, s85, 29 +; GFX9-NEXT: v_writelane_b32 v20, s86, 30 +; GFX9-NEXT: v_writelane_b32 v20, s87, 31 +; GFX9-NEXT: v_writelane_b32 v20, s96, 32 +; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v20, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: v_readfirstlane_b32 s42, v3 +; GFX9-NEXT: v_readfirstlane_b32 s43, v4 +; GFX9-NEXT: v_readfirstlane_b32 s40, v5 +; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: v_writelane_b32 v20, s99, 35 +; GFX9-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s44, s44, 3 +; GFX9-NEXT: s_addc_u32 s45, s45, 0 +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s4, s4, 3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: s_lshl_b32 s46, s36, 8 +; GFX9-NEXT: s_and_b32 s47, s80, 0xff +; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s46, s46, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s71, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s69, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: s_lshl_b32 s16, s68, 8 +; GFX9-NEXT: s_and_b32 s17, s18, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s34, 8 +; GFX9-NEXT: s_and_b32 s18, s67, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: s_and_b32 s16, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s66, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s64, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshl_b32 s16, s55, 8 +; GFX9-NEXT: s_and_b32 s17, s20, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s30, 8 +; GFX9-NEXT: s_and_b32 s18, s54, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: s_and_b32 s16, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s53, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s51, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: s_lshl_b32 s16, s50, 8 +; GFX9-NEXT: s_and_b32 s17, s22, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s94, 8 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: s_and_b32 s16, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: s_lshl_b32 s16, s99, 8 +; GFX9-NEXT: s_and_b32 s17, s24, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s92, 8 +; GFX9-NEXT: s_and_b32 s18, s98, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: s_and_b32 s16, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s97, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s87, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: s_lshl_b32 s16, s86, 8 +; GFX9-NEXT: s_and_b32 s17, s26, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s90, 8 +; GFX9-NEXT: s_and_b32 s18, s85, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: s_and_b32 s16, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s84, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s83, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: s_lshl_b32 s16, s82, 8 +; GFX9-NEXT: s_and_b32 s17, s28, 0xff +; GFX9-NEXT: v_readlane_b32 s18, v21, 50 +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s88, 8 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 49 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: s_and_b32 s16, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 48 +; GFX9-NEXT: v_readlane_b32 s18, v21, 47 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 45 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s78, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 43 +; GFX9-NEXT: v_readlane_b32 s18, v21, 42 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 40 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s76, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 38 +; GFX9-NEXT: v_readlane_b32 s18, v21, 37 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 35 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s74, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 33 +; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s72, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: s_lshl_b32 s15, s15, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: v_readlane_b32 s15, v21, 28 +; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: s_and_b32 s15, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s62, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: v_readlane_b32 s13, v21, 23 +; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: s_and_b32 s13, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s60, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: v_readlane_b32 s11, v21, 18 +; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: s_and_b32 s11, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s58, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: v_readlane_b32 s9, v21, 13 +; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: s_and_b32 s9, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_readlane_b32 s7, v21, 8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 5 +; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_readlane_b32 s5, v21, 3 +; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_readlane_b32 s9, v21, 1 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: v_readlane_b32 s99, v20, 35 +; GFX9-NEXT: v_readlane_b32 s98, v20, 34 +; GFX9-NEXT: v_readlane_b32 s97, v20, 33 +; GFX9-NEXT: v_readlane_b32 s96, v20, 32 +; GFX9-NEXT: v_readlane_b32 s87, v20, 31 +; GFX9-NEXT: v_readlane_b32 s86, v20, 30 +; GFX9-NEXT: v_readlane_b32 s85, v20, 29 +; GFX9-NEXT: v_readlane_b32 s84, v20, 28 +; GFX9-NEXT: v_readlane_b32 s83, v20, 27 +; GFX9-NEXT: v_readlane_b32 s82, v20, 26 +; GFX9-NEXT: v_readlane_b32 s81, v20, 25 +; GFX9-NEXT: v_readlane_b32 s80, v20, 24 +; GFX9-NEXT: v_readlane_b32 s71, v20, 23 +; GFX9-NEXT: v_readlane_b32 s70, v20, 22 +; GFX9-NEXT: v_readlane_b32 s69, v20, 21 +; GFX9-NEXT: v_readlane_b32 s68, v20, 20 +; GFX9-NEXT: v_readlane_b32 s67, v20, 19 +; GFX9-NEXT: v_readlane_b32 s66, v20, 18 +; GFX9-NEXT: v_readlane_b32 s65, v20, 17 +; GFX9-NEXT: v_readlane_b32 s64, v20, 16 +; GFX9-NEXT: v_readlane_b32 s55, v20, 15 +; GFX9-NEXT: v_readlane_b32 s54, v20, 14 +; GFX9-NEXT: v_readlane_b32 s53, v20, 13 +; GFX9-NEXT: v_readlane_b32 s52, v20, 12 +; GFX9-NEXT: v_readlane_b32 s51, v20, 11 +; GFX9-NEXT: v_readlane_b32 s50, v20, 10 +; GFX9-NEXT: v_readlane_b32 s49, v20, 9 +; GFX9-NEXT: v_readlane_b32 s48, v20, 8 +; GFX9-NEXT: v_readlane_b32 s39, v20, 7 +; GFX9-NEXT: v_readlane_b32 s38, v20, 6 +; GFX9-NEXT: v_readlane_b32 s37, v20, 5 +; GFX9-NEXT: v_readlane_b32 s36, v20, 4 +; GFX9-NEXT: v_readlane_b32 s35, v20, 3 +; GFX9-NEXT: v_readlane_b32 s34, v20, 2 +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: v_writelane_b32 v21, s82, 0 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: v_writelane_b32 v21, s83, 1 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-TRUE16-LABEL: bitcast_v16i64_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s40, s40, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s41, s41, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s14, s14, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: .LBB57_3: ; %end +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s99 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s57, s57, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s56, s57, s56 +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s94 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s98 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s46, s46, s47 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s97 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s96 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s56, s56, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s57, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s87 +; GFX11-TRUE16-NEXT: s_or_b32 s56, s56, s57 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s86 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s45, s46, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s85 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s83 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s82 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s18 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s71 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s81 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s70 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s80 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s69 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s68 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s66 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s67 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s64 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s65 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s55 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s76 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s53 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s54 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[3:6], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s52 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s51 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s50 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s48 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s37 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s35 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s104 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s103 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s102 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s16, v18, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s101 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s18, v18, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s19, v18, 4 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 6 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s100 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v18, 10 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v18, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v18, 13 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-TRUE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v19, 2 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 15 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 18 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v18, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v19, 4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v19, 5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v18, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 23 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v19, 6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 25 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v18, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v18, 28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v18, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s5, v18, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v19, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v19, 7 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; +; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_branch .LBB57_3 +; GFX11-FAKE16-NEXT: .LBB57_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, -1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 5 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 7 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 15 +; GFX11-FAKE16-NEXT: .LBB57_3: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, s104 +; GFX11-FAKE16-NEXT: s_mov_b32 s104, s57 +; GFX11-FAKE16-NEXT: s_mov_b32 s57, s69 +; GFX11-FAKE16-NEXT: s_mov_b32 s69, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_5 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s40, s40, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s41, s41, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s74, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 9 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s30, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s44, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v18, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s100, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s99, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v18, 6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s92, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 29 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v19, 28 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 19 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s86, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 21 +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 22 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 20 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s97, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s69, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s18, s72, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s73, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s96, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s87, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s85, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s84, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s29, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s83, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s82, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s81, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s61, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s70, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s58, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s68, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s67, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s66, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s12, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s65, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s64, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s55, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s54, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s53, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s52, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s51, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s11, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s50, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s49, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s48, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s8, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s39, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s38, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 11 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s37, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s36, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s35, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s56, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s7, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s34, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_hi, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s46, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 15 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s103, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s102, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB29_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB29_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB58_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16i64: ; VI: ; %bb.0: @@ -44026,7 +89958,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -44499,9 +90431,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB29_2: ; %Flow +; VI-NEXT: .LBB58_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_4 +; VI-NEXT: s_cbranch_execz .LBB58_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -44890,7 +90822,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB29_4: ; %end +; VI-NEXT: .LBB58_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -45262,7 +91194,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -45736,9 +91668,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB29_2: ; %Flow +; GFX9-NEXT: .LBB58_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_4 +; GFX9-NEXT: s_cbranch_execz .LBB58_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -46133,7 +92065,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB29_4: ; %end +; GFX9-NEXT: .LBB58_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -46375,15 +92307,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_4 -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4 +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB29_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h @@ -46540,68 +92472,554 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 @@ -46612,551 +93030,5376 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-TRUE16-NEXT: .LBB29_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: .LBB58_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: .LBB58_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB59_3 +; SI-NEXT: .LBB59_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB59_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v58, v2 +; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB59_3 +; VI-NEXT: .LBB59_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB59_3: ; %Flow +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB59_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_or_b32_e32 v30, v30, v31 +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 +; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB59_3 +; GFX9-NEXT: .LBB59_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB59_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB59_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB59_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -47166,983 +98409,5564 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %Flow +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB59_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v16i64_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB60_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB60_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB60_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB60_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB60_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB60_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s70, v1 +; SI-NEXT: v_readfirstlane_b32 s71, v2 +; SI-NEXT: v_readfirstlane_b32 s80, v3 +; SI-NEXT: v_readfirstlane_b32 s81, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v5 +; SI-NEXT: v_readfirstlane_b32 s83, v6 +; SI-NEXT: v_readfirstlane_b32 s84, v7 +; SI-NEXT: v_readfirstlane_b32 s85, v8 +; SI-NEXT: v_readfirstlane_b32 s86, v9 +; SI-NEXT: v_readfirstlane_b32 s87, v10 +; SI-NEXT: v_readfirstlane_b32 s96, v11 +; SI-NEXT: v_readfirstlane_b32 s97, v12 +; SI-NEXT: v_readfirstlane_b32 s98, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: s_lshl_b32 s4, s9, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 1 +; SI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 3 +; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s6, 16 +; SI-NEXT: s_and_b32 s15, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s99, 16 +; SI-NEXT: s_and_b32 s41, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s98, 16 +; SI-NEXT: s_and_b32 s43, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s97, 16 +; SI-NEXT: s_and_b32 s45, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s96, 16 +; SI-NEXT: s_and_b32 s47, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s87, 16 +; SI-NEXT: s_and_b32 s57, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s86, 16 +; SI-NEXT: s_and_b32 s59, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s85, 16 +; SI-NEXT: s_and_b32 s61, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s84, 16 +; SI-NEXT: s_and_b32 s63, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s83, 16 +; SI-NEXT: s_and_b32 s73, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s82, 16 +; SI-NEXT: s_and_b32 s75, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s81, 16 +; SI-NEXT: s_and_b32 s77, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s80, 16 +; SI-NEXT: s_and_b32 s79, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s71, 16 +; SI-NEXT: s_and_b32 s89, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s70, 16 +; SI-NEXT: s_and_b32 s91, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s29, 16 +; SI-NEXT: s_and_b32 s93, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s28, 16 +; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s27, 16 +; SI-NEXT: s_and_b32 s31, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s26, 16 +; SI-NEXT: s_and_b32 s35, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s25, 16 +; SI-NEXT: s_and_b32 s37, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s24, 16 +; SI-NEXT: s_and_b32 s39, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s23, 16 +; SI-NEXT: s_and_b32 s49, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s22, 16 +; SI-NEXT: s_and_b32 s51, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s21, 16 +; SI-NEXT: s_and_b32 s53, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s20, 16 +; SI-NEXT: s_and_b32 s55, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s19, 16 +; SI-NEXT: s_and_b32 s65, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s18, 16 +; SI-NEXT: s_and_b32 s67, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s17, 16 +; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s17, s19, 0 +; SI-NEXT: s_add_u32 s18, s20, 3 +; SI-NEXT: s_addc_u32 s19, s21, 0 +; SI-NEXT: s_add_u32 s20, s22, 3 +; SI-NEXT: s_addc_u32 s21, s23, 0 +; SI-NEXT: s_add_u32 s22, s24, 3 +; SI-NEXT: s_addc_u32 s23, s25, 0 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s25, s27, 0 +; SI-NEXT: s_add_u32 s26, s28, 3 +; SI-NEXT: s_addc_u32 s27, s29, 0 +; SI-NEXT: s_add_u32 s28, s70, 3 +; SI-NEXT: s_addc_u32 s29, s71, 0 +; SI-NEXT: s_add_u32 s76, s80, 3 +; SI-NEXT: s_addc_u32 s74, s81, 0 +; SI-NEXT: s_add_u32 s72, s82, 3 +; SI-NEXT: s_addc_u32 s62, s83, 0 +; SI-NEXT: s_add_u32 s60, s84, 3 +; SI-NEXT: s_addc_u32 s58, s85, 0 +; SI-NEXT: s_add_u32 s56, s86, 3 +; SI-NEXT: s_addc_u32 s46, s87, 0 +; SI-NEXT: s_add_u32 s44, s96, 3 +; SI-NEXT: s_addc_u32 s42, s97, 0 +; SI-NEXT: s_add_u32 s40, s98, 3 +; SI-NEXT: s_addc_u32 s14, s99, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_and_b32 s10, s9, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s10, 0 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_writelane_b32 v21, s9, 1 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v21, s9, 2 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s6, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s41, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_and_b32 s43, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_and_b32 s45, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_and_b32 s57, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s56, 16 +; SI-NEXT: s_and_b32 s59, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s58, 16 +; SI-NEXT: s_and_b32 s61, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s60, 16 +; SI-NEXT: s_and_b32 s63, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s62, 16 +; SI-NEXT: s_and_b32 s73, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s72, 16 +; SI-NEXT: s_and_b32 s75, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s74, 16 +; SI-NEXT: s_and_b32 s77, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s76, 16 +; SI-NEXT: s_and_b32 s79, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s29, 16 +; SI-NEXT: s_and_b32 s89, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s28, 16 +; SI-NEXT: s_and_b32 s91, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s27, 16 +; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s26, 16 +; SI-NEXT: s_and_b32 s95, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s25, 16 +; SI-NEXT: s_and_b32 s31, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s24, 16 +; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s23, 16 +; SI-NEXT: s_and_b32 s37, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s22, 16 +; SI-NEXT: s_and_b32 s39, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s21, 16 +; SI-NEXT: s_and_b32 s49, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s20, 16 +; SI-NEXT: s_and_b32 s51, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s19, 16 +; SI-NEXT: s_and_b32 s53, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s18, 16 +; SI-NEXT: s_and_b32 s55, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s17, 16 +; SI-NEXT: s_and_b32 s65, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s16, 16 +; SI-NEXT: s_and_b32 s67, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s5, 16 +; SI-NEXT: s_and_b32 s69, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s4, 16 +; SI-NEXT: v_writelane_b32 v21, s8, 3 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_readlane_b32 s4, v21, 2 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v21, 3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_readlane_b32 s4, v21, 0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v21, 1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB61_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_3: +; GFX11-NEXT: .LBB61_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB62_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB62_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB62_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB62_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB62_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB29_4: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB62_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <16 x i64> + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x i64> br label %end cmp.false: - %a3 = bitcast <128 x i8> %a to <16 x i64> + %a3 = bitcast <64 x bfloat> %a to <16 x i64> br label %end end: @@ -48150,3788 +103974,3616 @@ end: ret <16 x i64> %phi } -define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB30_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: .LBB30_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v16i64_to_v64bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB30_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB30_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v16i64_to_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB30_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB30_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v16i64_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB30_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB30_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <16 x i64> %a, splat (i64 3) - %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> - br label %end - -cmp.false: - %a3 = bitcast <16 x i64> %a to <64 x bfloat> - br label %end - -end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi -} - -define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB31_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB31_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB63_2 ; -; VI-LABEL: bitcast_v64bf16_to_v16i64: +; VI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB63_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_3 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB31_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v64bf16_to_v16i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB31_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_4: +; VI-NEXT: s_branch .LBB63_2 +; +; GFX9-LABEL: bitcast_v64bf16_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_3 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB31_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB63_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: s_branch .LBB63_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB31_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB31_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB31_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB63_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB63_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB63_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB31_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB63_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB63_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB63_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -51950,775 +107602,808 @@ end: } define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v62 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v63 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB32_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v32, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v34, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v36, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v38, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v50, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v52, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v63 -; GCN-NEXT: v_addc_u32_e32 v54, vcc, 0, v62, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v51 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v50 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v39 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v38 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v37 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v36 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v35 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v33 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v32 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: .LBB32_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v2, v1 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v46 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v45 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v43 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v41 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v54 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v53 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v48 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v34, v30 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v37 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v38, v35 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v39, v33 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v48, v32 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v49, v31 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB64_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_addc_u32_e32 v44, vcc, 0, v62, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 +; SI-NEXT: v_mov_b32_e32 v50, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v46, v28 +; SI-NEXT: v_mov_b32_e32 v34, v8 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: .LBB64_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64f16: ; VI: ; %bb.0: @@ -52730,7 +108415,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: s_cbranch_execz .LBB64_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -52765,7 +108450,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB32_2: ; %end +; VI-NEXT: .LBB64_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -52780,7 +108465,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: s_cbranch_execz .LBB64_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -52815,7 +108500,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB32_2: ; %end +; GFX9-NEXT: .LBB64_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -52832,7 +108517,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -52875,7 +108560,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -52896,768 +108581,1637 @@ end: ret <64 x half> %phi } +define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s46, v1 +; SI-NEXT: v_readfirstlane_b32 s47, v2 +; SI-NEXT: v_readfirstlane_b32 s44, v3 +; SI-NEXT: v_readfirstlane_b32 s45, v4 +; SI-NEXT: v_readfirstlane_b32 s42, v5 +; SI-NEXT: v_readfirstlane_b32 s43, v6 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s58, s20, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s60, s22, 16 +; SI-NEXT: s_lshr_b32 s61, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s62, s24, 16 +; SI-NEXT: s_lshr_b32 s63, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s72, s26, 16 +; SI-NEXT: s_lshr_b32 s73, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s74, s28, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_lshr_b32 s76, s46, 16 +; SI-NEXT: s_lshr_b32 s77, s47, 16 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s78, s44, 16 +; SI-NEXT: s_lshr_b32 s79, s45, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s88, s42, 16 +; SI-NEXT: s_lshr_b32 s89, s43, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s90, s40, 16 +; SI-NEXT: s_lshr_b32 s91, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s92, s14, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s94, s12, 16 +; SI-NEXT: s_lshr_b32 s95, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 vcc_lo, s10, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s30, s7, 16 +; SI-NEXT: s_lshr_b32 s31, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s35, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 +; SI-NEXT: s_lshr_b32 s34, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v9, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v11, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v45, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v43, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v41, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v55, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v53, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v48, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v38, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v36, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v34, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v32, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v30, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v16i64_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB65_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_3: +; GFX11-NEXT: .LBB65_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB33_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB33_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16i64: ; VI: ; %bb.0: @@ -53669,7 +110223,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -53769,7 +110323,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -53784,7 +110338,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -53820,7 +110374,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -53837,7 +110391,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -53872,7 +110426,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB33_2: ; %end +; GFX11-NEXT: .LBB66_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -53893,368 +110447,1540 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB67_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB67_3 +; SI-NEXT: .LBB67_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB67_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB67_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB67_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB67_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_3 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB67_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_4: +; VI-NEXT: s_branch .LBB67_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_3 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB67_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: s_branch .LBB67_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB67_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_3 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB67_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB67_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v49, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v54, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v40, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v45, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v49, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v54, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v40, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v45, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_or_b32_e32 v45, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v54 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v51 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v15, v15, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v46 -; GCN-NEXT: v_or_b32_e32 v18, v18, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v19, v19, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v20, v20, v34 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v34 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; GCN-NEXT: v_or_b32_e32 v22, v22, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v41 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; GCN-NEXT: v_or_b32_e32 v25, v25, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v55 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; GCN-NEXT: v_or_b32_e32 v27, v27, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v30, v30, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v32, v32, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v31, v31, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64i16: ; VI: ; %bb.0: @@ -54266,7 +111992,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -54301,7 +112027,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB34_2: ; %end +; VI-NEXT: .LBB68_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -54316,7 +112042,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -54351,7 +112077,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB34_2: ; %end +; GFX9-NEXT: .LBB68_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -54368,7 +112094,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -54411,7 +112137,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB34_2: ; %end +; GFX11-NEXT: .LBB68_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -54432,613 +112158,1206 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s47, s47, 3 +; SI-NEXT: s_addc_u32 s46, s46, 0 +; SI-NEXT: s_add_u32 s45, s45, 3 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, s4, v16 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v16i64_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB69_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB69_3: +; GFX11-NEXT: .LBB69_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB35_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB35_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB70_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v16i64: ; VI: ; %bb.0: @@ -55050,7 +113369,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: s_cbranch_execz .LBB70_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v33, 3 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 @@ -55150,7 +113469,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -55165,7 +113484,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -55200,7 +113519,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -55217,7 +113536,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -55252,7 +113571,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: .LBB70_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -55273,1249 +113592,2254 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s39, s6 +; VI-NEXT: s_or_b32 s7, s38, s7 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v15, s6 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v51, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v53, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v61, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v62, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v2 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v51, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v53, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v61, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v62, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v2 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v63 -; GCN-NEXT: v_or_b32_e32 v1, v1, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_or_b32_e32 v2, v2, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v33, v34 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v63, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v62, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v61, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v53 -; GCN-NEXT: v_or_b32_e32 v48, v48, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v60 -; GCN-NEXT: v_or_b32_e32 v54, v54, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v52 -; GCN-NEXT: v_or_b32_e32 v60, v50, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v59 -; GCN-NEXT: v_or_b32_e32 v59, v49, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v51 -; GCN-NEXT: v_or_b32_e32 v32, v39, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v58 -; GCN-NEXT: v_or_b32_e32 v30, v38, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v28, v37, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v57 -; GCN-NEXT: v_or_b32_e32 v26, v36, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v24, v63, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v56 -; GCN-NEXT: v_or_b32_e32 v22, v62, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v20, v61, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v47 -; GCN-NEXT: v_or_b32_e32 v16, v35, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v46 -; GCN-NEXT: v_or_b32_e32 v17, v34, v17 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_or_b32_e32 v19, v33, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v45 -; GCN-NEXT: v_or_b32_e32 v21, v18, v21 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GCN-NEXT: v_or_b32_e32 v23, v14, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v44 -; GCN-NEXT: v_or_b32_e32 v25, v13, v14 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_or_b32_e32 v27, v12, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v43 -; GCN-NEXT: v_or_b32_e32 v29, v11, v12 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GCN-NEXT: v_or_b32_e32 v31, v10, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v42 -; GCN-NEXT: v_or_b32_e32 v33, v9, v10 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_or_b32_e32 v35, v8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v41 -; GCN-NEXT: v_or_b32_e32 v52, v7, v8 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GCN-NEXT: v_or_b32_e32 v50, v6, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v40 -; GCN-NEXT: v_or_b32_e32 v49, v5, v6 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NEXT: v_or_b32_e32 v51, v4, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v55 -; GCN-NEXT: v_or_b32_e32 v53, v3, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v55, v2, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v41, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v10, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v12, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v6, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v7, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v8, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v9, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v11, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v13, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v18, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v34, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v36, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v37, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v38, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v39, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v40, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v42, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v43, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v44, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v45, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v46, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v47, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v56, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v57, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v58, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v61, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v62, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v63, v4, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v3, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v54, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v60 -; GCN-NEXT: v_or_b32_e32 v4, v2, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v59 -; GCN-NEXT: v_or_b32_e32 v59, v10, v12 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v5, v10, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v60, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v13, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v37 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v38 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v39 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v40 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v42 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v43 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v31, v31, v44 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v35, v35, v46 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v37, v37, v47 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v39, v39, v56 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v49, v49, v57 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v51, v51, v58 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v53, v53, v61 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GCN-NEXT: v_or_b32_e32 v55, v55, v62 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_or_b32_e32 v41, v41, v63 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v42, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB72_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v42, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: @@ -56717,7 +116041,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill @@ -56893,9 +116217,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 -; VI-NEXT: .LBB36_2: ; %Flow +; VI-NEXT: .LBB72_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_4 +; VI-NEXT: s_cbranch_execz .LBB72_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 @@ -57087,7 +116411,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 -; VI-NEXT: .LBB36_4: ; %end +; VI-NEXT: .LBB72_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v55 @@ -57683,7 +117007,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -57879,9 +117203,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB36_2: ; %Flow +; GFX9-NEXT: .LBB72_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_4 +; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 @@ -58092,7 +117416,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 -; GFX9-NEXT: .LBB36_4: ; %end +; GFX9-NEXT: .LBB72_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58542,7 +117866,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -58609,9 +117933,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB36_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB72_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -58694,7 +118018,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB36_4: ; %end +; GFX11-TRUE16-NEXT: .LBB72_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -59143,7 +118467,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -59242,9 +118566,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB36_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB72_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -59359,7 +118683,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB36_4: ; %end +; GFX11-FAKE16-NEXT: .LBB72_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -59701,1621 +119025,7397 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 +; SI-NEXT: v_alignbit_b32 v48, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v50, v28, v27, 8 +; SI-NEXT: v_alignbit_b32 v52, v30, v29, 24 +; SI-NEXT: v_alignbit_b32 v54, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v40, v30, v29, 8 +; SI-NEXT: v_alignbit_b32 v42, v32, v31, 24 +; SI-NEXT: v_alignbit_b32 v44, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v46, v32, v31, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v32 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 24 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 24 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 +; SI-NEXT: v_alignbit_b32 v48, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v50, v28, v27, 8 +; SI-NEXT: v_alignbit_b32 v52, v30, v29, 24 +; SI-NEXT: v_alignbit_b32 v54, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v40, v30, v29, 8 +; SI-NEXT: v_alignbit_b32 v42, v32, v31, 24 +; SI-NEXT: v_alignbit_b32 v44, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v46, v32, v31, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v32 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v31, v31, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v44 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v42 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v35 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v40 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v52 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v61 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v50 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v38 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v60 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v58 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v57 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v47 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v45 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v41 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v51 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v37 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v16f64_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s14, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s40, v11 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_readfirstlane_b32 s42, v13 +; VI-NEXT: v_readfirstlane_b32 s43, v14 +; VI-NEXT: v_readfirstlane_b32 s44, v15 +; VI-NEXT: v_readfirstlane_b32 s45, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: s_lshr_b32 s81, s12, 16 +; VI-NEXT: s_lshr_b32 s80, s12, 8 +; VI-NEXT: s_lshr_b32 s83, s10, 16 +; VI-NEXT: s_lshr_b32 s82, s10, 8 +; VI-NEXT: s_lshr_b32 s85, s8, 16 +; VI-NEXT: s_lshr_b32 s84, s8, 8 +; VI-NEXT: s_lshr_b32 s51, s6, 16 +; VI-NEXT: s_lshr_b32 s50, s6, 8 +; VI-NEXT: s_lshr_b32 s52, s28, 16 +; VI-NEXT: s_lshr_b32 s86, s28, 8 +; VI-NEXT: s_lshr_b32 s87, s26, 16 +; VI-NEXT: s_lshr_b32 s53, s26, 8 +; VI-NEXT: s_lshr_b32 s55, s24, 16 +; VI-NEXT: s_lshr_b32 s54, s24, 8 +; VI-NEXT: s_lshr_b32 s65, s22, 16 +; VI-NEXT: s_lshr_b32 s64, s22, 8 +; VI-NEXT: s_lshr_b32 s67, s20, 16 +; VI-NEXT: s_lshr_b32 s66, s20, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s68, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s71, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[11:12], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[10:11], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[8:9], 1.0 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: v_add_f64 v[17:18], s[6:7], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: v_add_f64 v[19:20], s[28:29], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: v_add_f64 v[25:26], s[22:23], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_add_f64 v[27:28], s[20:21], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_add_f64 v[29:30], s[18:19], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: v_add_f64 v[31:32], s[16:17], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v6 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v7 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v10 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v10 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v9 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v14 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v15 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v17 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v19 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v22 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v21 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v23 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v30 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v31 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: s_branch .LBB73_5 +; VI-NEXT: .LBB73_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v33, s71 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s69 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s67 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s66 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s65 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s64 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s55 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s54 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s87 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s53 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s52 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s86 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s51 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s50 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s85 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s84 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s83 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s82 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s81 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s80 +; VI-NEXT: v_mov_b32_e32 v34, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: v_mov_b32_e32 v38, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v52, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: v_mov_b32_e32 v37, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: v_mov_b32_e32 v43, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 13 +; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: v_mov_b32_e32 v51, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: v_mov_b32_e32 v45, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: v_mov_b32_e32 v47, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: v_mov_b32_e32 v53, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: v_mov_b32_e32 v57, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: v_mov_b32_e32 v58, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: v_mov_b32_e32 v54, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: v_mov_b32_e32 v59, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: v_mov_b32_e32 v61, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: v_mov_b32_e32 v40, s46 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: v_mov_b32_e32 v2, s45 +; VI-NEXT: v_mov_b32_e32 v3, s42 +; VI-NEXT: v_mov_b32_e32 v4, s43 +; VI-NEXT: v_mov_b32_e32 v5, s40 +; VI-NEXT: v_mov_b32_e32 v6, s41 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v13, s10 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s6 +; VI-NEXT: v_mov_b32_e32 v18, s7 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v42, s70 +; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_mov_b32_e32 v40, v43 +; VI-NEXT: v_mov_b32_e32 v46, v38 +; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: .LBB73_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v50 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v50, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v36 +; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v31, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v61 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v39 +; VI-NEXT: v_or_b32_sdwa v30, v49, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v29, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v60 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v48 +; VI-NEXT: v_or_b32_sdwa v28, v59, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; VI-NEXT: v_or_b32_sdwa v25, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v27, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v54 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v58 +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; VI-NEXT: v_or_b32_sdwa v23, v23, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v57 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v53 +; VI-NEXT: v_or_b32_sdwa v24, v56, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v21, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v17, v17, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v9, v38, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: v_readfirstlane_b32 s8, v3 +; GFX9-NEXT: v_readfirstlane_b32 s9, v4 +; GFX9-NEXT: v_readfirstlane_b32 s10, v5 +; GFX9-NEXT: v_readfirstlane_b32 s11, v6 +; GFX9-NEXT: v_readfirstlane_b32 s12, v7 +; GFX9-NEXT: v_readfirstlane_b32 s13, v8 +; GFX9-NEXT: v_readfirstlane_b32 s14, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v10 +; GFX9-NEXT: v_readfirstlane_b32 s40, v11 +; GFX9-NEXT: v_readfirstlane_b32 s41, v12 +; GFX9-NEXT: v_readfirstlane_b32 s42, v13 +; GFX9-NEXT: v_readfirstlane_b32 s43, v14 +; GFX9-NEXT: v_readfirstlane_b32 s44, v15 +; GFX9-NEXT: v_readfirstlane_b32 s45, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s17, 8 +; GFX9-NEXT: s_lshr_b32 s83, s44, 16 +; GFX9-NEXT: s_lshr_b32 s82, s44, 8 +; GFX9-NEXT: s_lshr_b32 s85, s42, 16 +; GFX9-NEXT: s_lshr_b32 s84, s42, 8 +; GFX9-NEXT: s_lshr_b32 s87, s40, 16 +; GFX9-NEXT: s_lshr_b32 s86, s40, 8 +; GFX9-NEXT: s_lshr_b32 s97, s14, 16 +; GFX9-NEXT: s_lshr_b32 s96, s14, 8 +; GFX9-NEXT: s_lshr_b32 s99, s12, 16 +; GFX9-NEXT: s_lshr_b32 s98, s12, 8 +; GFX9-NEXT: s_lshr_b32 s39, s10, 16 +; GFX9-NEXT: s_lshr_b32 s38, s10, 8 +; GFX9-NEXT: s_lshr_b32 s49, s8, 16 +; GFX9-NEXT: s_lshr_b32 s48, s8, 8 +; GFX9-NEXT: s_lshr_b32 s51, s6, 16 +; GFX9-NEXT: s_lshr_b32 s50, s6, 8 +; GFX9-NEXT: s_lshr_b32 s53, s28, 16 +; GFX9-NEXT: s_lshr_b32 s52, s28, 8 +; GFX9-NEXT: s_lshr_b32 s55, s26, 16 +; GFX9-NEXT: s_lshr_b32 s54, s26, 8 +; GFX9-NEXT: s_lshr_b32 s65, s24, 16 +; GFX9-NEXT: s_lshr_b32 s64, s24, 8 +; GFX9-NEXT: s_lshr_b32 s67, s22, 16 +; GFX9-NEXT: s_lshr_b32 s66, s22, 8 +; GFX9-NEXT: s_lshr_b32 s69, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s71, s18, 16 +; GFX9-NEXT: s_lshr_b32 s70, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s81, s16, 16 +; GFX9-NEXT: s_lshr_b32 s80, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[11:12], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], s[8:9], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: v_add_f64 v[23:24], s[6:7], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_add_f64 v[27:28], s[28:29], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_add_f64 v[35:36], s[22:23], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: v_add_f64 v[48:49], s[18:19], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[19:20] +; GFX9-NEXT: v_add_f64 v[50:51], s[16:17], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v37 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[50:51] +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v33 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v35 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 24, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v36 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v38 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v49 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v50 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: s_branch .LBB73_5 +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v15, s81 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s71 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s69 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s67 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s65 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s53 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s52 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s50 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s49 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s39 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s99 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s96 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s87 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s86 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s85 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s84 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s83 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s82 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: v_mov_b32_e32 v29, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: v_mov_b32_e32 v41, s66 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: v_mov_b32_e32 v26, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: v_mov_b32_e32 v21, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: v_mov_b32_e32 v22, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s44 +; GFX9-NEXT: v_mov_b32_e32 v2, s45 +; GFX9-NEXT: v_mov_b32_e32 v3, s42 +; GFX9-NEXT: v_mov_b32_e32 v4, s43 +; GFX9-NEXT: v_mov_b32_e32 v5, s40 +; GFX9-NEXT: v_mov_b32_e32 v6, s41 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v13, s10 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v20, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s6 +; GFX9-NEXT: v_mov_b32_e32 v24, s7 +; GFX9-NEXT: v_mov_b32_e32 v27, s28 +; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v31, s26 +; GFX9-NEXT: v_mov_b32_e32 v32, s27 +; GFX9-NEXT: v_mov_b32_e32 v33, s24 +; GFX9-NEXT: v_mov_b32_e32 v34, s25 +; GFX9-NEXT: v_mov_b32_e32 v35, s22 +; GFX9-NEXT: v_mov_b32_e32 v36, s23 +; GFX9-NEXT: v_mov_b32_e32 v37, s20 +; GFX9-NEXT: v_mov_b32_e32 v38, s21 +; GFX9-NEXT: v_mov_b32_e32 v48, s18 +; GFX9-NEXT: v_mov_b32_e32 v49, s19 +; GFX9-NEXT: v_mov_b32_e32 v50, s16 +; GFX9-NEXT: v_mov_b32_e32 v51, s17 +; GFX9-NEXT: v_mov_b32_e32 v56, s80 +; GFX9-NEXT: v_mov_b32_e32 v45, s70 +; GFX9-NEXT: v_mov_b32_e32 v43, s68 +; GFX9-NEXT: v_mov_b32_e32 v54, s64 +; GFX9-NEXT: v_mov_b32_e32 v52, s54 +; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB73_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v33, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v34, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v58, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v37, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v16, v46, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v16, v55, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v31, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v20, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16f64_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s10, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s8, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s4, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s18, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s16, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], s[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[50:51], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], s[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[54:55], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], s[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], s[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[31:32], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[37:38], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[66:67], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[66:67] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 8, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 16, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 16, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v66 +; GFX11-TRUE16-NEXT: s_branch .LBB73_5 +; GFX11-TRUE16-NEXT: .LBB73_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB73_2 +; GFX11-TRUE16-NEXT: .LBB73_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 29 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s0 +; GFX11-TRUE16-NEXT: .LBB73_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v66, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v83, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v45, v54, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v43, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v43, 0xff, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v67, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xff, v72 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v183, 0xff, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v66, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, v83, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v47, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v183, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v82, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v43, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v66, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v66, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v82, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v50, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v82, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v42 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v37, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v50, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v167 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v37, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v177 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v176 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v66, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[80:83], off offset:16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v66, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v165 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v54, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xff, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v54, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v66, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v32, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v27, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v28, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v23, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v32, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v116 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v35, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v38, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v19, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, v20, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v15, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v24, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v13, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v14, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v19, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v20, v17 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[80:83], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[31:34], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v74, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:92 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s41, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s41, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s40, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s12, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s8, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s22, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s20, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[10:11], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[32:33], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[52:53], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[1:2], s[40:41], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], s[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], s[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[64:65], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[64:65] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v64 +; GFX11-FAKE16-NEXT: s_branch .LBB73_5 +; GFX11-FAKE16-NEXT: .LBB73_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB73_2 +; GFX11-FAKE16-NEXT: .LBB73_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s0 :: v_dual_mov_b32 v65, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v4, s15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v87, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v39, s54 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s12 :: v_dual_mov_b32 v6, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v96, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s8 :: v_dual_mov_b32 v10, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v12, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v99, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s53 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s4 :: v_dual_mov_b32 v14, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v100, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v101, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s51 :: v_dual_mov_b32 v28, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s23 :: v_dual_mov_b32 v32, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v112, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s49 :: v_dual_mov_b32 v36, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v48, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v114, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s48 :: v_dual_mov_b32 v52, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v44, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v41, s104 :: v_dual_mov_b32 v116, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s34 :: v_dual_mov_b32 v43, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v181, s102 :: v_dual_mov_b32 v182, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v119, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v51, s39 :: v_dual_mov_b32 v176, s100 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v177, s99 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v163, s98 :: v_dual_mov_b32 v160, s96 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v128, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v165, s97 :: v_dual_mov_b32 v148, s86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v161, s87 :: v_dual_mov_b32 v144, s83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v129, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v71, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v149, s85 :: v_dual_mov_b32 v130, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v135, s84 :: v_dual_mov_b32 v118, s71 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v132, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v131, s81 :: v_dual_mov_b32 v102, s68 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v117, s80 :: v_dual_mov_b32 v98, s65 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v133, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v83, s37 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s70 :: v_dual_mov_b32 v84, s64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s69 :: v_dual_mov_b32 v86, s55 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v134, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v103, s67 :: v_dual_mov_b32 v18, s52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s66 :: v_dual_mov_b32 v22, s50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v145, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v85, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s42 :: v_dual_mov_b32 v38, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s88 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v146, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s60 :: v_dual_mov_b32 v30, s78 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s76 :: v_dual_mov_b32 v25, s74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v21, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s62 :: v_dual_mov_b32 v80, s44 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v150, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v68, s58 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v66, s30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v54, s94 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s92 :: v_dual_mov_b32 v151, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v162, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v166, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v167, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v179, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v180, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v183, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v47, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v56, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v57, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v58, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v59, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v60, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-FAKE16-NEXT: .LBB73_5: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v64, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, v82, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, v41, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v65, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v72, v64, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v82, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v73, v64, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v82, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v74, v52, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v48, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v81, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v75, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v49, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v70, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v43, v48, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v44, v48, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v64, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v53, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v45, v36, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v46, v37, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v52, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v65, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v32, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, v33, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v28, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, v29, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v36, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v37, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v52, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v148, v23, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v150, v19, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v151, v20, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v130, v15, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v149, v24, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v33, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v131, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v132, v13, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v133, v14, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v26, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v18, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v21, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[72:75], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[43:46], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v17 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[67:70], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[148:151], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[130:133], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v77, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v77, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v77, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v77, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v77, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v77, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v77, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v77, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v77, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v76, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v76, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v76, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v76, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v76, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v76, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v76, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v76, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v76, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v76, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v76, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v76, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v76, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v76, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v76, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v76, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v76, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v76, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v76, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v76, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v76, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v76, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v76, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v76, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:92 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB37_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB37_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB74_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16f64: ; VI: ; %bb.0: @@ -61647,7 +126747,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -62120,9 +127220,9 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB37_2: ; %Flow +; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_4 +; VI-NEXT: s_cbranch_execz .LBB74_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -62511,7 +127611,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB37_4: ; %end +; VI-NEXT: .LBB74_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -62883,4965 +127983,13945 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB37_2 +; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: .LBB74_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB74_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB37_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB37_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX9-NEXT: .LBB74_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4 +; GFX11-TRUE16-NEXT: .LBB74_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 +; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: .LBB74_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: .LBB74_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB75_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB75_3 +; SI-NEXT: .LBB75_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB75_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB75_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB75_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:212 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB75_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v58, v2 +; VI-NEXT: v_mov_b32_e32 v32, v36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v61, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB75_3 +; VI-NEXT: .LBB75_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v36 +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: v_mov_b32_e32 v49, v38 +; VI-NEXT: v_mov_b32_e32 v44, v24 +; VI-NEXT: v_mov_b32_e32 v40, v21 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v43, v59 +; VI-NEXT: v_mov_b32_e32 v52, v48 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB75_3: ; %Flow +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB75_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_or_b32_e32 v30, v30, v31 +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v53 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v63 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v42 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v44 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v61 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v27, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v50 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v33, v51, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v26, 24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v50, vcc, 3, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v49 +; VI-NEXT: v_or_b32_sdwa v32, v58, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v39 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v37 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: .LBB75_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB75_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 -; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB75_3 +; GFX9-NEXT: .LBB75_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB75_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB75_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 -; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB37_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB75_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_3 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_4 -; GFX11-TRUE16-NEXT: .LBB37_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB37_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB37_2 -; GFX11-TRUE16-NEXT: .LBB37_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_3 +; GFX11-TRUE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB75_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB75_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB75_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_3 +; GFX11-FAKE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB37_2: ; %Flow +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB75_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB75_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB75_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v16f64_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v62 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB76_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB76_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB76_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB76_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: .LBB76_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB77_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s46, s17, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s46, 0 +; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 1 +; SI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s46, 2 +; SI-NEXT: s_lshl_b32 s46, s16, 16 +; SI-NEXT: s_and_b32 s59, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s7, 16 +; SI-NEXT: s_and_b32 s57, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s6, 16 +; SI-NEXT: s_and_b32 s99, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s5, 16 +; SI-NEXT: s_and_b32 s97, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s4, 16 +; SI-NEXT: s_and_b32 s87, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s9, 16 +; SI-NEXT: s_and_b32 s85, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s8, 16 +; SI-NEXT: s_and_b32 s83, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s82, s11, 16 +; SI-NEXT: s_and_b32 s81, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s10, 16 +; SI-NEXT: s_and_b32 s71, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s13, 16 +; SI-NEXT: s_and_b32 s69, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s12, 16 +; SI-NEXT: s_and_b32 s67, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s15, 16 +; SI-NEXT: s_and_b32 s65, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s14, 16 +; SI-NEXT: s_and_b32 s55, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s41, 16 +; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s40, 16 +; SI-NEXT: s_and_b32 s51, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s43, 16 +; SI-NEXT: s_and_b32 s49, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s42, 16 +; SI-NEXT: s_and_b32 s39, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s45, 16 +; SI-NEXT: s_and_b32 s37, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s44, 16 +; SI-NEXT: s_and_b32 s35, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s29, 16 +; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s28, 16 +; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s27, 16 +; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s26, 16 +; SI-NEXT: s_and_b32 s91, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s25, 16 +; SI-NEXT: s_and_b32 s89, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s24, 16 +; SI-NEXT: s_and_b32 s79, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s23, 16 +; SI-NEXT: s_and_b32 s77, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s22, 16 +; SI-NEXT: s_and_b32 s75, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s21, 16 +; SI-NEXT: s_and_b32 s73, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s20, 16 +; SI-NEXT: s_and_b32 s63, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s19, 16 +; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 3 +; SI-NEXT: s_cbranch_execnz .LBB77_4 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v1, s71 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s69 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s68 +; SI-NEXT: v_mov_b32_e32 v61, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: v_mov_b32_e32 v5, s59 +; SI-NEXT: v_mov_b32_e32 v4, s58 +; SI-NEXT: v_mov_b32_e32 v9, s57 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v13, s99 +; SI-NEXT: v_mov_b32_e32 v10, s98 +; SI-NEXT: v_mov_b32_e32 v17, s97 +; SI-NEXT: v_mov_b32_e32 v14, s96 +; SI-NEXT: v_mov_b32_e32 v21, s87 +; SI-NEXT: v_mov_b32_e32 v18, s86 +; SI-NEXT: v_mov_b32_e32 v25, s85 +; SI-NEXT: v_mov_b32_e32 v22, s84 +; SI-NEXT: v_mov_b32_e32 v29, s83 +; SI-NEXT: v_mov_b32_e32 v26, s82 +; SI-NEXT: v_mov_b32_e32 v33, s81 +; SI-NEXT: v_mov_b32_e32 v30, s80 +; SI-NEXT: v_mov_b32_e32 v34, s70 +; SI-NEXT: v_mov_b32_e32 v8, s67 +; SI-NEXT: v_mov_b32_e32 v7, s66 +; SI-NEXT: v_mov_b32_e32 v24, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v16, s55 +; SI-NEXT: v_mov_b32_e32 v15, s54 +; SI-NEXT: v_mov_b32_e32 v28, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v12, s51 +; SI-NEXT: v_mov_b32_e32 v11, s50 +; SI-NEXT: v_mov_b32_e32 v32, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v20, s39 +; SI-NEXT: v_mov_b32_e32 v19, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v35, s36 +; SI-NEXT: v_mov_b32_e32 v38, s35 +; SI-NEXT: v_mov_b32_e32 v37, s34 +; SI-NEXT: v_mov_b32_e32 v48, s31 +; SI-NEXT: v_mov_b32_e32 v39, s30 +; SI-NEXT: v_mov_b32_e32 v50, s95 +; SI-NEXT: v_mov_b32_e32 v49, s94 +; SI-NEXT: v_mov_b32_e32 v52, s93 +; SI-NEXT: v_mov_b32_e32 v51, s92 +; SI-NEXT: v_mov_b32_e32 v54, s91 +; SI-NEXT: v_mov_b32_e32 v53, s90 +; SI-NEXT: v_mov_b32_e32 v40, s89 +; SI-NEXT: v_mov_b32_e32 v55, s88 +; SI-NEXT: v_mov_b32_e32 v42, s79 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v43, s77 +; SI-NEXT: v_mov_b32_e32 v44, s76 +; SI-NEXT: v_mov_b32_e32 v46, s75 +; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v47, s73 +; SI-NEXT: v_mov_b32_e32 v56, s72 +; SI-NEXT: v_mov_b32_e32 v58, s63 +; SI-NEXT: v_mov_b32_e32 v57, s62 +; SI-NEXT: v_mov_b32_e32 v60, s61 +; SI-NEXT: v_mov_b32_e32 v59, s60 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB77_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_3 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB77_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_4: +; VI-NEXT: s_branch .LBB77_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_3 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB77_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: s_branch .LBB77_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB77_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: .LBB77_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB78_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB78_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB78_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB78_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB78_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB78_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB78_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB37_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB37_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <16 x double> - br label %end - -cmp.false: - %a3 = bitcast <128 x i8> %a to <16 x double> - br label %end - -end: - %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <16 x double> %phi -} - -define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v60, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v58, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v60, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v16f64_to_v64bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB38_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v16f64_to_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v16f64_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB78_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x double> br label %end cmp.false: - %a3 = bitcast <16 x double> %a to <64 x bfloat> + %a3 = bitcast <64 x bfloat> %a to <16 x double> br label %end end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi } -define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB39_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB39_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB79_2 ; -; VI-LABEL: bitcast_v64bf16_to_v16f64: +; VI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -67849,1619 +141929,2398 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB39_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: s_branch .LBB79_2 ; -; GFX9-LABEL: bitcast_v64bf16_to_v16f64: +; GFX9-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB39_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB39_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: s_branch .LBB79_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB39_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB79_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB79_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB79_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB39_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB79_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB79_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB79_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -69480,740 +144339,766 @@ end: } define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v25 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v27 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v3 -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v63, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v61, v3, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v60, v4, v3 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v59 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v59, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v58 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v58, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v57 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v47 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v44 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v42 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v41 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v40 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v53 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v52 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v51 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v50 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v48 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v37 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v39 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v49 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v63, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB80_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_f64 v[35:36], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64f16: ; VI: ; %bb.0: @@ -70225,7 +145110,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -70244,7 +145129,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -70259,7 +145144,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -70278,7 +145163,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -70295,7 +145180,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -70314,7 +145199,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -70335,768 +145220,1602 @@ end: ret <64 x half> %phi } +define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s46 +; SI-NEXT: s_lshr_b32 s46, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s46 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 +; SI-NEXT: s_lshr_b32 s46, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: s_lshr_b32 s46, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 +; SI-NEXT: s_lshr_b32 s46, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s46 +; SI-NEXT: s_lshr_b32 s46, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 +; SI-NEXT: s_lshr_b32 s46, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s46 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s46 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s46 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 +; SI-NEXT: s_lshr_b32 s46, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s46 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s46 +; SI-NEXT: s_lshr_b32 s46, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[52:53], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_add_f64 v[48:49], s[26:27], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_add_f64 v[36:37], s[28:29], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: v_mov_b32_e32 v7, v61 +; SI-NEXT: v_mov_b32_e32 v61, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v47 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_mov_b32_e32 v14, v12 +; SI-NEXT: v_mov_b32_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: s_branch .LBB81_2 +; +; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB81_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_3 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB81_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_4: +; VI-NEXT: s_branch .LBB81_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_3 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB81_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: s_branch .LBB81_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB81_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: .LBB81_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB41_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB41_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB82_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB82_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16f64: ; VI: ; %bb.0: @@ -71108,7 +146827,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -71208,7 +146927,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -71223,7 +146942,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -71259,7 +146978,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -71276,7 +146995,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -71311,7 +147030,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: .LBB82_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -71332,490 +147051,1526 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB83_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB83_3 +; SI-NEXT: .LBB83_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB83_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB83_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB83_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_3 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB83_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: s_branch .LBB83_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB83_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: s_branch .LBB83_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB83_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB83_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63 -; GCN-NEXT: v_or_b32_e32 v1, v1, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v63, 0xffff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v45 -; GCN-NEXT: v_or_b32_e32 v46, v44, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v62 -; GCN-NEXT: v_or_b32_e32 v62, v42, v6 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; GCN-NEXT: v_or_b32_e32 v44, v41, v6 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v61 -; GCN-NEXT: v_or_b32_e32 v45, v4, v6 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v3, v3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v7, v7, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v58 -; GCN-NEXT: v_or_b32_e32 v9, v9, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v11, v11, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v43 -; GCN-NEXT: v_or_b32_e32 v13, v13, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v40, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v21, v53, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v51, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v38, v29 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v33, v33, v36 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v48, v36 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v39, v39, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v35, v35, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v50, v50, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v34, v34, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v54, v54, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v52, v52, v41 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v41, v63, v41 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v46, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64i16: ; VI: ; %bb.0: @@ -71827,7 +148582,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -71846,7 +148601,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB42_2: ; %end +; VI-NEXT: .LBB84_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -71861,7 +148616,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -71880,7 +148635,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB42_2: ; %end +; GFX9-NEXT: .LBB84_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -71897,7 +148652,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -71916,7 +148671,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB42_2: ; %end +; GFX11-NEXT: .LBB84_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -71937,613 +148692,1149 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB85_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: s_cbranch_execnz .LBB85_3 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: .LBB85_3: ; %end +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v60 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v47 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB85_4: +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_branch .LBB85_2 +; +; VI-LABEL: bitcast_v16f64_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB85_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB85_3 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB85_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB85_4: +; VI-NEXT: s_branch .LBB85_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB85_3 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB85_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: s_branch .LBB85_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB85_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB85_3: +; GFX11-NEXT: .LBB85_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB43_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB43_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB86_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB86_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v16f64: ; VI: ; %bb.0: @@ -72555,7 +149846,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: s_cbranch_execz .LBB86_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v33, 3 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 @@ -72655,7 +149946,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -72670,7 +149961,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: s_cbranch_execz .LBB86_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -72705,7 +149996,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -72722,7 +150013,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -72757,7 +150048,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: .LBB86_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -72778,2153 +150069,3194 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s39, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s39, s6 +; VI-NEXT: s_or_b32 s7, s38, s7 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v15, s6 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: .LBB87_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB87_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB87_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v10 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:344 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:364 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:360 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v55, v1, v15 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GCN-NEXT: v_or_b32_e32 v22, v1, v22 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v1, v52 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: v_or_b32_e32 v5, v1, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v13, v1, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v21, v1, v21 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v6, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v10, v9, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v17, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v25, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v12, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v14, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v14, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v24 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v29, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v38, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v19, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v49, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v19, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; kill: killed $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; kill: killed $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; kill: killed $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; kill: killed $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; kill: killed $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; kill: killed $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; kill: killed $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: .LBB44_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v19 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v56 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v1 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v1, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: v_mov_b32_e32 v2, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v1 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v1, v15 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v1, v17 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v1, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v1 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v1, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v28, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v37, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v40, v2, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v45, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v58, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v60, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v2, v25 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v2, v26 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v2, v27 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v2, v29 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v2, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v2, v31 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v2, v33 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v2, v34 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v2, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v2, v36 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v2, v38 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v2, v50 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v2, v54 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v2, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v2, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v2, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v2, v44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v2, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v2, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v56, v2, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v61, v61, v2 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v1, v4 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s7, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v60 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v52, v8 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: v_or_b32_e32 v10, v55, v10 -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: v_or_b32_e32 v14, v27, v14 -; GCN-NEXT: v_or_b32_e32 v15, v29, v15 -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: v_or_b32_e32 v17, v31, v17 -; GCN-NEXT: v_or_b32_e32 v18, v33, v18 -; GCN-NEXT: v_or_b32_e32 v19, v34, v19 -; GCN-NEXT: v_or_b32_e32 v20, v35, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v38, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v54, v28 -; GCN-NEXT: v_or_b32_e32 v25, v41, v32 -; GCN-NEXT: v_or_b32_e32 v26, v42, v37 -; GCN-NEXT: v_or_b32_e32 v27, v43, v40 -; GCN-NEXT: v_or_b32_e32 v28, v44, v45 -; GCN-NEXT: v_or_b32_e32 v29, v46, v57 -; GCN-NEXT: v_or_b32_e32 v30, v47, v58 -; GCN-NEXT: v_or_b32_e32 v31, v56, v59 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v61 -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v31 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v6 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v39 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v50 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v35 -; GCN-NEXT: .LBB44_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v13, 16 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v63, v5, v11, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v56, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v62, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v61, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v60, v13, v16, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v59, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_alignbit_b32 v58, v29, v35, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v3, v7, v3, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v57, v32, v35, 16 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v33, v34, v33, 16 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 56, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 60, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v52, 16 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 64, v0 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v54, 16 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x48, v0 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v17, v36, v17, 16 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x4c, v0 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v42, 16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x54, v0 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_alignbit_b32 v37, v37, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x58, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v12, v14, v12, 16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x5c, v0 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v24, v28, v24, 16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x64, v0 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v40, 16 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_alignbit_b32 v38, v39, v38, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_alignbit_b32 v51, v51, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v63, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v5, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v27, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v5, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v30, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v9, v5, v26 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v7, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v19, v57, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v16, v13, v61 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v23, v62, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v3, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v25 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_or_b32_e32 v11, v41, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_or_b32_e32 v15, v5, v44 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v59 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_or_b32_e32 v5, v45, v5 +; SI-NEXT: v_or_b32_e32 v17, v63, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_or_b32_e32 v11, v41, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v13, v13, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_or_b32_e32 v14, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v15, v9, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v16, v9, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v17, v8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v18, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v19, v8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v20, v8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v21, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v22, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v23, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v24, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v25, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v26, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v27, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v28, v23 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v31 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v32 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: .LBB88_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64bf16: ; VI: ; %bb.0: @@ -75258,7 +153590,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -75740,9 +154072,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: .LBB44_2: ; %Flow +; VI-NEXT: .LBB88_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_4 +; VI-NEXT: s_cbranch_execz .LBB88_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload @@ -76129,7 +154461,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 ; VI-NEXT: v_or_b32_e32 v21, v39, v21 ; VI-NEXT: v_or_b32_e32 v31, v31, v54 -; VI-NEXT: .LBB44_4: ; %end +; VI-NEXT: .LBB88_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -76505,7 +154837,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: s_cbranch_execz .LBB88_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -76988,9 +155320,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: .LBB44_2: ; %Flow +; GFX9-NEXT: .LBB88_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB44_4 +; GFX9-NEXT: s_cbranch_execz .LBB88_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload @@ -77381,7 +155713,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 ; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 ; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 -; GFX9-NEXT: .LBB44_4: ; %end +; GFX9-NEXT: .LBB88_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -77626,15 +155958,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_4 -; GFX11-TRUE16-NEXT: .LBB44_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_4 +; GFX11-TRUE16-NEXT: .LBB88_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l @@ -77893,8 +156225,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 -; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_2 +; GFX11-TRUE16-NEXT: .LBB88_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 @@ -78168,923 +156500,6586 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: .LBB88_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: .LBB88_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v43, s19, 0 +; SI-NEXT: v_writelane_b32 v43, s18, 1 +; SI-NEXT: v_writelane_b32 v43, s17, 2 +; SI-NEXT: v_writelane_b32 v43, s16, 3 +; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: s_mov_b32 s77, s28 +; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s80, 24 +; SI-NEXT: v_writelane_b32 v41, s81, 25 +; SI-NEXT: v_writelane_b32 v41, s82, 26 +; SI-NEXT: v_writelane_b32 v41, s83, 27 +; SI-NEXT: v_writelane_b32 v41, s84, 28 +; SI-NEXT: v_writelane_b32 v41, s85, 29 +; SI-NEXT: v_writelane_b32 v41, s86, 30 +; SI-NEXT: v_writelane_b32 v41, s87, 31 +; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: v_readfirstlane_b32 s38, v20 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s39, v19 +; SI-NEXT: v_writelane_b32 v42, s38, 0 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: v_writelane_b32 v42, s39, 1 +; SI-NEXT: v_readfirstlane_b32 s49, v26 +; SI-NEXT: v_writelane_b32 v42, s48, 2 +; SI-NEXT: v_readfirstlane_b32 s50, v24 +; SI-NEXT: v_writelane_b32 v42, s49, 3 +; SI-NEXT: v_readfirstlane_b32 s51, v23 +; SI-NEXT: v_writelane_b32 v42, s50, 4 +; SI-NEXT: v_readfirstlane_b32 s52, v29 +; SI-NEXT: v_writelane_b32 v42, s51, 5 +; SI-NEXT: v_readfirstlane_b32 s53, v30 +; SI-NEXT: v_writelane_b32 v42, s52, 6 +; SI-NEXT: v_readfirstlane_b32 s54, v28 +; SI-NEXT: v_writelane_b32 v42, s53, 7 +; SI-NEXT: v_readfirstlane_b32 s55, v27 +; SI-NEXT: v_writelane_b32 v42, s54, 8 +; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s88, v4 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 +; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 +; SI-NEXT: v_writelane_b32 v43, s4, 9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s91, v10 +; SI-NEXT: v_readfirstlane_b32 s92, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v7 +; SI-NEXT: v_readfirstlane_b32 s94, v13 +; SI-NEXT: v_readfirstlane_b32 s95, v14 +; SI-NEXT: v_readfirstlane_b32 s30, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v18 +; SI-NEXT: v_readfirstlane_b32 s34, v16 +; SI-NEXT: v_readfirstlane_b32 s35, v15 +; SI-NEXT: v_readfirstlane_b32 s36, v21 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 +; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s40, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 +; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v40 +; SI-NEXT: v_writelane_b32 v43, s4, 33 +; SI-NEXT: v_writelane_b32 v43, s22, 34 +; SI-NEXT: v_writelane_b32 v43, s23, 35 +; SI-NEXT: v_writelane_b32 v43, s72, 36 +; SI-NEXT: v_writelane_b32 v43, s20, 37 +; SI-NEXT: v_writelane_b32 v43, s79, 38 +; SI-NEXT: v_writelane_b32 v43, s76, 39 +; SI-NEXT: v_writelane_b32 v43, s25, 40 +; SI-NEXT: v_writelane_b32 v43, s60, 41 +; SI-NEXT: v_writelane_b32 v43, s29, 42 +; SI-NEXT: v_writelane_b32 v43, s77, 43 +; SI-NEXT: v_writelane_b32 v43, s16, 44 +; SI-NEXT: v_writelane_b32 v43, s17, 45 +; SI-NEXT: v_writelane_b32 v43, s18, 46 +; SI-NEXT: v_writelane_b32 v43, s19, 47 +; SI-NEXT: v_writelane_b32 v43, s88, 48 +; SI-NEXT: v_writelane_b32 v43, s89, 49 +; SI-NEXT: v_writelane_b32 v43, s90, 50 +; SI-NEXT: v_writelane_b32 v43, s91, 51 +; SI-NEXT: v_writelane_b32 v43, s92, 52 +; SI-NEXT: v_writelane_b32 v43, s93, 53 +; SI-NEXT: v_writelane_b32 v43, s94, 54 +; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s10, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s66, v35 +; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s27, v32 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s69, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s14, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s68, v39 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s11, v49 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s70, v50 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s71, v51 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 +; SI-NEXT: v_writelane_b32 v43, s30, 58 +; SI-NEXT: v_writelane_b32 v43, s31, 59 +; SI-NEXT: v_writelane_b32 v43, s34, 60 +; SI-NEXT: v_writelane_b32 v43, s35, 61 +; SI-NEXT: v_writelane_b32 v43, s36, 62 +; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s99, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s7, v51 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s97, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s44, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s9, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s80, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s86, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s85, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s8, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s12, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s65, v48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s64, v49 +; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s67, v50 +; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s84, v51 +; SI-NEXT: v_writelane_b32 v42, s67, 12 +; SI-NEXT: v_writelane_b32 v42, s84, 13 +; SI-NEXT: v_writelane_b32 v42, s85, 14 +; SI-NEXT: v_writelane_b32 v42, s86, 15 +; SI-NEXT: v_writelane_b32 v42, s87, 16 +; SI-NEXT: v_writelane_b32 v42, s8, 17 +; SI-NEXT: v_writelane_b32 v42, s99, 18 +; SI-NEXT: v_writelane_b32 v42, s12, 19 +; SI-NEXT: v_writelane_b32 v42, s44, 20 +; SI-NEXT: v_writelane_b32 v42, s97, 21 +; SI-NEXT: v_writelane_b32 v42, s83, 22 +; SI-NEXT: v_writelane_b32 v42, s82, 23 +; SI-NEXT: v_writelane_b32 v42, s98, 24 +; SI-NEXT: v_writelane_b32 v42, s96, 25 +; SI-NEXT: v_writelane_b32 v42, s81, 26 +; SI-NEXT: v_writelane_b32 v42, s9, 27 +; SI-NEXT: v_writelane_b32 v42, s41, 28 +; SI-NEXT: v_writelane_b32 v42, s80, 29 +; SI-NEXT: v_writelane_b32 v42, s7, 30 +; SI-NEXT: v_writelane_b32 v42, s56, 31 +; SI-NEXT: v_writelane_b32 v42, s26, 32 +; SI-NEXT: v_writelane_b32 v42, s15, 33 +; SI-NEXT: v_writelane_b32 v42, s14, 34 +; SI-NEXT: v_writelane_b32 v42, s69, 35 +; SI-NEXT: v_writelane_b32 v42, s71, 36 +; SI-NEXT: v_writelane_b32 v42, s70, 37 +; SI-NEXT: v_writelane_b32 v42, s68, 38 +; SI-NEXT: v_writelane_b32 v42, s74, 39 +; SI-NEXT: v_writelane_b32 v42, s46, 40 +; SI-NEXT: v_writelane_b32 v42, s11, 41 +; SI-NEXT: v_writelane_b32 v42, s10, 42 +; SI-NEXT: v_writelane_b32 v42, s62, 43 +; SI-NEXT: v_writelane_b32 v42, s66, 44 +; SI-NEXT: v_writelane_b32 v42, s58, 45 +; SI-NEXT: v_writelane_b32 v42, s28, 46 +; SI-NEXT: v_writelane_b32 v42, s27, 47 +; SI-NEXT: v_writelane_b32 v42, s78, 48 +; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v43, 2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_writelane_b32 v42, s4, 56 +; SI-NEXT: v_readlane_b32 s4, v43, 1 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_mov_b32 s22, s6 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s25, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 59 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v42, s5, 60 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s76, 24 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v42, s5, 61 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s17, 24 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s88, 24 +; SI-NEXT: s_mov_b32 s4, s47 +; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s19, 24 +; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_and_b32 s6, s93, 0xff +; SI-NEXT: s_lshl_b32 s16, s92, 8 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s16, s90, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s91, 24 +; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 +; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s95, 24 +; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_and_b32 s16, s35, 0xff +; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_and_b32 s17, s39, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s38, 24 +; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_and_b32 s17, s36, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s37, 24 +; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_and_b32 s17, s51, 0xff +; SI-NEXT: s_lshl_b32 s18, s50, 8 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s48, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s49, 24 +; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: s_and_b32 s18, s55, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s54, 24 +; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_and_b32 s18, s52, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s53, 24 +; SI-NEXT: s_or_b32 s94, s19, s18 +; SI-NEXT: s_and_b32 s18, s84, 0xff +; SI-NEXT: s_lshl_b32 s19, s67, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s64, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s65, 24 +; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_and_b32 s19, s12, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s8, 24 +; SI-NEXT: s_or_b32 s8, s20, s19 +; SI-NEXT: s_and_b32 s19, s85, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s86, 24 +; SI-NEXT: s_or_b32 s12, s20, s19 +; SI-NEXT: s_and_b32 s19, s80, 0xff +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: s_or_b32 vcc_lo, s19, s20 +; SI-NEXT: s_and_b32 s19, s44, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s97, 24 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_or_b32 vcc_hi, s19, s20 +; SI-NEXT: s_and_b32 s19, s99, 0xff +; SI-NEXT: v_writelane_b32 v42, s9, 50 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s87, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s41, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s98, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s96, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 54 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s46, 0xff +; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_or_b32 s84, s19, s20 +; SI-NEXT: s_and_b32 s19, s71, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s70, 24 +; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s68, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 53 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s14, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_lshl_b32 s20, s66, 8 +; SI-NEXT: s_or_b32 s85, s19, s20 +; SI-NEXT: s_and_b32 s19, s10, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s62, 24 +; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_and_b32 s19, s27, 0xff +; SI-NEXT: v_writelane_b32 v42, s9, 55 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s28, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 33 +; SI-NEXT: s_or_b32 s50, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 31 +; SI-NEXT: s_or_b32 s51, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 30 +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: s_or_b32 s86, s19, s20 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 27 +; SI-NEXT: s_or_b32 s52, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 25 +; SI-NEXT: s_or_b32 s53, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 23 +; SI-NEXT: s_or_b32 s54, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 22 +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_or_b32 s87, s19, s20 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 19 +; SI-NEXT: s_or_b32 s55, s20, s19 +; SI-NEXT: s_mov_b32 s58, s9 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: s_or_b32 s64, s20, s19 +; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s24, 24 +; SI-NEXT: s_or_b32 s65, s20, s19 +; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_or_b32 s26, s19, s20 +; SI-NEXT: s_and_b32 s19, s13, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s57, 24 +; SI-NEXT: s_or_b32 s66, s20, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_or_b32 s67, s20, s19 +; SI-NEXT: s_and_b32 s19, s42, 0xff +; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: s_or_b32 s68, s20, s19 +; SI-NEXT: s_and_b32 s19, s63, 0xff +; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_or_b32 s27, s19, s20 +; SI-NEXT: s_and_b32 s19, s40, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: s_or_b32 s69, s20, s19 +; SI-NEXT: s_and_b32 s19, s61, 0xff +; SI-NEXT: s_mov_b32 s39, s57 +; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s75, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 16 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_mov_b32 s10, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s71, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 14 +; SI-NEXT: s_or_b32 s62, s20, s19 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: s_or_b32 s29, s19, s20 +; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 10 +; SI-NEXT: s_or_b32 s80, s20, s19 +; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s81, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 8 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s96, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 6 +; SI-NEXT: s_mov_b32 s36, s63 +; SI-NEXT: s_mov_b32 s63, s93 +; SI-NEXT: s_mov_b32 s93, s61 +; SI-NEXT: s_mov_b32 s61, s91 +; SI-NEXT: s_mov_b32 s91, s75 +; SI-NEXT: s_mov_b32 s75, s92 +; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: s_mov_b32 s98, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: s_mov_b32 s44, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 4 +; SI-NEXT: s_mov_b32 s48, s13 +; SI-NEXT: s_mov_b32 s13, s94 +; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: s_or_b32 s21, s19, s20 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s22, 24 +; SI-NEXT: v_readlane_b32 s4, v42, 58 +; SI-NEXT: s_mov_b32 s46, s45 +; SI-NEXT: s_mov_b32 s34, s73 +; SI-NEXT: s_mov_b32 s73, s12 +; SI-NEXT: s_mov_b32 s37, s42 +; SI-NEXT: s_mov_b32 s38, s59 +; SI-NEXT: s_mov_b32 s59, s8 +; SI-NEXT: s_mov_b32 s30, s88 +; SI-NEXT: s_mov_b32 s88, s31 +; SI-NEXT: s_mov_b32 s78, s40 +; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: s_mov_b32 s12, s7 +; SI-NEXT: s_mov_b32 s7, s22 +; SI-NEXT: s_or_b32 s83, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s4, 16 +; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s22, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s19, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16 +; SI-NEXT: s_lshl_b32 s99, s84, 16 +; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_lshl_b32 s97, s86, 16 +; SI-NEXT: s_lshl_b32 s28, s87, 16 +; SI-NEXT: s_lshl_b32 s87, s26, 16 +; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: s_lshl_b32 s86, s27, 16 +; SI-NEXT: v_readlane_b32 s27, v42, 57 +; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: s_lshl_b32 s85, s29, 16 +; SI-NEXT: v_readlane_b32 s29, v42, 60 +; SI-NEXT: v_readlane_b32 s24, v42, 59 +; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: s_lshl_b32 s84, s21, 16 +; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s98, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_add_i32 s6, s12, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s6, s81, 8 +; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s96, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s16, s41, 8 +; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s16, s9, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_add_i32 s18, s10, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s71, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s18, s30, 8 +; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s34, 8 +; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s20, s46, 8 +; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_add_i32 s20, s58, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_lshl_b32 s22, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 48 +; SI-NEXT: s_add_i32 s23, s7, 3 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s22, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s23, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_lshl_b32 s23, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 21 +; SI-NEXT: s_add_i32 s60, s7, 3 +; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s60, s60, 16 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_or_b32 s23, s23, s60 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_add_i32 s23, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s60, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_lshl_b32 s60, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: s_add_i32 s61, s7, 3 +; SI-NEXT: s_and_b32 s61, s61, 0xff +; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_add_i32 s60, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s61, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_lshl_b32 s61, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: s_add_i32 s62, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: s_and_b32 s62, s62, 0xff +; SI-NEXT: s_add_i32 s59, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: s_lshl_b32 s62, s62, 16 +; SI-NEXT: s_addk_i32 s60, 0x300 +; SI-NEXT: s_and_b32 s59, s59, 0xff +; SI-NEXT: s_lshl_b32 s58, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s60, s60, 0xffff +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_lshl_b32 s59, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_add_i32 s61, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 45 +; SI-NEXT: s_add_i32 s57, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: s_lshl_b32 s56, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: s_lshl_b32 s47, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: s_add_i32 s46, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: s_add_i32 s45, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: s_lshl_b32 s42, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 34 +; SI-NEXT: s_and_b32 s45, s45, 0xff +; SI-NEXT: s_add_i32 s14, s7, 3 +; SI-NEXT: s_or_b32 s42, s42, s45 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: s_and_b32 s57, s57, 0xff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s42, 0xffff +; SI-NEXT: s_add_i32 s44, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_or_b32 s57, s14, s15 +; SI-NEXT: s_and_b32 s14, s44, 0xff +; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: s_add_i32 s40, s7, 3 +; SI-NEXT: s_and_b32 s61, s61, 0xff +; SI-NEXT: s_and_b32 s40, s40, 0xff +; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_or_b32 s59, s59, s61 +; SI-NEXT: s_and_b32 s58, s58, 0xffff +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s58, s59, s58 +; SI-NEXT: s_or_b32 s59, s15, s14 +; SI-NEXT: s_add_i32 s14, s6, 0x3000000 +; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: s_add_i32 s11, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_lshl_b32 s8, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_lshl_b32 s8, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: s_add_i32 s24, s7, 3 +; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_or_b32 s8, s8, s6 +; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: s_add_i32 s12, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s11, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_lshl_b32 s11, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: s_add_i32 s12, s7, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s13, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_and_b32 s11, s13, 0xff +; SI-NEXT: s_lshl_b32 s10, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: s_add_i32 s25, s7, 3 +; SI-NEXT: s_and_b32 s12, s25, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s9, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 27 +; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: s_add_i32 s21, s9, 3 +; SI-NEXT: v_readlane_b32 s11, v42, 17 +; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: s_and_b32 s9, s21, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s12, v42, 12 +; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: s_add_i32 s15, s16, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_readlane_b32 s13, v42, 8 +; SI-NEXT: v_readlane_b32 s16, v42, 6 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v42, 7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s13, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: s_add_i32 s40, s17, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: s_add_i32 s41, s18, 0x3000000 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s17, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: s_add_i32 s42, s19, 0x3000000 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s18, v43, 60 +; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: s_add_i32 s43, s20, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s19, v43, 56 +; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_readlane_b32 s19, v43, 55 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_lshl_b32 s19, s19, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_readlane_b32 s20, v43, 52 +; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: s_add_i32 s44, s22, 0x3000000 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s21, v43, 48 +; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s21, s21, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s21, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: s_add_i32 s45, s23, 0x3000000 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s22, v43, 42 +; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 8 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s22, s22, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s22, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s23, v43, 40 +; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s20, s23, s20 +; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s23, s23, 24 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_or_b32 s20, s23, s20 +; SI-NEXT: s_add_i32 s23, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s24, v43, 36 +; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s20, s24, s20 +; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: s_and_b32 s25, s25, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s24, s24, 24 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_or_b32 s20, s24, s20 +; SI-NEXT: v_readlane_b32 s24, v43, 3 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_readlane_b32 s25, v43, 2 +; SI-NEXT: v_readlane_b32 s26, v43, 1 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s56, s46, s47 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s9, 16 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_lshl_b32 s25, s25, 24 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s17, 16 +; SI-NEXT: v_writelane_b32 v42, s9, 50 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s46, s60, 0x3000000 +; SI-NEXT: s_add_i32 s56, s56, 0x3000000 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s16, s16, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 +; SI-NEXT: s_add_i32 s20, s20, 0x3000000 +; SI-NEXT: s_add_i32 s24, s24, 0x3000000 +; SI-NEXT: v_writelane_b32 v42, s7, 53 +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s24, 16 +; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s23, 16 +; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s22, 16 +; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s19, 16 +; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s18, 16 +; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s13, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s12, 16 +; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_writelane_b32 v42, s7, 54 +; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s58, 16 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s57, 16 +; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s47, 16 +; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s46, 16 +; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s45, 16 +; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s44, 16 +; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s43, 16 +; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s42, 16 +; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s41, 16 +; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s40, 16 +; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s15, 16 +; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s14, 16 +; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s5, 16 +; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s4, 16 +; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 50 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_readlane_b32 s4, v42, 51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 53 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v42, 54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 55 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v41, 35 +; SI-NEXT: v_readlane_b32 s98, v41, 34 +; SI-NEXT: v_readlane_b32 s97, v41, 33 +; SI-NEXT: v_readlane_b32 s96, v41, 32 +; SI-NEXT: v_readlane_b32 s87, v41, 31 +; SI-NEXT: v_readlane_b32 s86, v41, 30 +; SI-NEXT: v_readlane_b32 s85, v41, 29 +; SI-NEXT: v_readlane_b32 s84, v41, 28 +; SI-NEXT: v_readlane_b32 s83, v41, 27 +; SI-NEXT: v_readlane_b32 s82, v41, 26 +; SI-NEXT: v_readlane_b32 s81, v41, 25 +; SI-NEXT: v_readlane_b32 s80, v41, 24 +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: s_mov_b32 s7, s6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: v_readlane_b32 s58, v43, 19 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_mov_b32 s95, s47 +; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: s_mov_b32 s93, s61 +; SI-NEXT: s_mov_b32 s34, s73 +; SI-NEXT: s_mov_b32 s91, s75 +; SI-NEXT: v_readlane_b32 s56, v43, 10 +; SI-NEXT: s_mov_b32 s36, s63 +; SI-NEXT: s_mov_b32 s38, s59 +; SI-NEXT: s_mov_b32 s37, s42 +; SI-NEXT: v_readlane_b32 s30, v43, 17 +; SI-NEXT: v_readlane_b32 s98, v43, 6 +; SI-NEXT: s_mov_b32 s46, s45 +; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: s_mov_b32 s78, s40 +; SI-NEXT: v_readlane_b32 s15, v43, 14 +; SI-NEXT: s_mov_b32 s39, s57 +; SI-NEXT: s_mov_b32 s48, s13 +; SI-NEXT: v_readlane_b32 s41, v43, 13 +; SI-NEXT: v_readlane_b32 s44, v43, 5 +; SI-NEXT: v_readlane_b32 s9, v43, 11 +; SI-NEXT: v_readlane_b32 s14, v43, 12 +; SI-NEXT: v_readlane_b32 s81, v43, 9 +; SI-NEXT: v_readlane_b32 s10, v43, 16 +; SI-NEXT: v_readlane_b32 s12, v43, 4 +; SI-NEXT: v_readlane_b32 s96, v43, 7 +; SI-NEXT: v_readlane_b32 s82, v43, 8 +; SI-NEXT: v_readlane_b32 s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v46, v16 +; VI-NEXT: v_mov_b32_e32 v60, v5 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_mov_b32_e32 v62, v21 +; VI-NEXT: v_mov_b32_e32 v47, v17 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v18 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:88 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:204 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v22 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:284 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:280 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:312 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB89_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v60 +; VI-NEXT: v_mov_b32_e32 v28, v26 +; VI-NEXT: v_mov_b32_e32 v26, v23 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v22 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v59, v10 +; VI-NEXT: v_mov_b32_e32 v58, v43 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v27, v14 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, v63 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v57 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v60 +; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v56, v45 +; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v53 +; VI-NEXT: v_mov_b32_e32 v55, v63 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v46 +; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v41, v52 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v54 +; VI-NEXT: v_mov_b32_e32 v54, v49 +; VI-NEXT: v_mov_b32_e32 v49, v53 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB89_3 +; VI-NEXT: .LBB89_2: +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v56, v45 +; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: v_mov_b32_e32 v35, v63 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v53 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v52, v38 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB89_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v38, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB89_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v47 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v53, v34 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_and_b32 s10, s24, 0xff +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_and_b32 s12, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s11, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s12 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 +; VI-NEXT: s_and_b32 s10, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v39, 0xff, v39 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s10, 16 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v31, 24, v63 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x300, v28 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x300, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v23 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 24, v59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v43, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v26 +; VI-NEXT: v_and_b32_e32 v26, 0xff, v43 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_or_b32_sdwa v26, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v44, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v27, 0xff, v44 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x300, v5 +; VI-NEXT: v_lshlrev_b32_e32 v32, 24, v49 +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; VI-NEXT: v_or_b32_e32 v4, v4, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v29, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v5, v29, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v6, v29, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v8, v29, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v9, v29, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v10, v29, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v35, 24, v56 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v37 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v14, v29, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v62 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v55 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v36 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v29, v29, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v53 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v19, v29, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v57 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v20, v29, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v38 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v22 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v55, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v54 +; VI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v40, 24, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v41, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; VI-NEXT: v_and_b32_e32 v41, 0xff, v41 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; VI-NEXT: v_or_b32_e32 v55, v40, v55 +; VI-NEXT: v_or_b32_sdwa v23, v55, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v42, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v42 +; VI-NEXT: v_or_b32_e32 v54, v54, v55 +; VI-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v53, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_or_b32_e32 v35, v35, v53 +; VI-NEXT: v_or_b32_sdwa v25, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v52, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v51, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; VI-NEXT: v_or_b32_e32 v29, v29, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_e32 v31, v31, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 +; VI-NEXT: .LBB89_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v39, v16 +; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v55, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v50, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v49, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v45 +; GFX9-NEXT: v_mov_b32_e32 v45, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB89_3 +; GFX9-NEXT: .LBB89_2: +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB89_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB89_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_lshl_b32 s6, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshl_b32 s9, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshl_b32 s10, s19, 8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: .LBB89_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v2, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v68 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v165 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB89_3 +; GFX11-TRUE16-NEXT: .LBB89_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v167, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v176, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v165 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v165, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v70, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v69, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v64, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v53, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v38, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v37, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v34, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v53, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v49, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 +; GFX11-TRUE16-NEXT: .LBB89_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB89_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB89_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB89_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB89_3 +; GFX11-FAKE16-NEXT: .LBB89_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 -; GFX11-FAKE16-NEXT: .LBB44_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 +; GFX11-FAKE16-NEXT: .LBB89_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB89_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB89_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -79103,2093 +163098,2037 @@ end: } define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v46 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v45 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v28, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v25, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v26, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v40, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v39 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v42, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v44, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v46, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v56, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v58, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_alignbit_b32 v61, v2, v18, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_alignbit_b32 v15, v2, v19, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_alignbit_b32 v14, v2, v17, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_alignbit_b32 v13, v2, v27, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_alignbit_b32 v59, v2, v33, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v35 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v39 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v48 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v60 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v20 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v22 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v34 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v11 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v43 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v47 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 24, v62 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v23 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v51, v52, v16, 16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v35, v12, v16, 16 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v36, v36, v12, 16 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v37, v37, v12, 16 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v38, v38, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v39, v6, v12, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v48, v3, v12, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v50, v5, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v52 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v52, v8, v3, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v53, v10, v3, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v54, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v55, v7, v2, 16 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v41, v9, v2, 16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v45, v11, v2, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v57, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v4, v51, v28, 24 -; GCN-NEXT: v_alignbit_b32 v10, v51, v28, 16 -; GCN-NEXT: v_alignbit_b32 v3, v51, v28, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v8, v35, v24, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v12, v36, v25, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: .LBB45_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v29 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v59, v13, v12, 16 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v63 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v47 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v43 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v20 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v42, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v60 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v62, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v60, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v23 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v27 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v31 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v53 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v54 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v55 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v40 -; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v41 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v42 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v43 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v44 -; GCN-NEXT: v_add_f32_e32 v52, 0x40c00000, v45 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v46 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v47 -; GCN-NEXT: v_add_f32_e32 v51, 0x40c00000, v56 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v57 -; GCN-NEXT: v_add_f32_e32 v47, 0x40c00000, v58 -; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v53, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v61 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v62 -; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v61, v1, v6, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_alignbit_b32 v58, v1, v2, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_alignbit_b32 v56, v1, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_alignbit_b32 v46, v1, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_alignbit_b32 v44, v1, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_alignbit_b32 v42, v1, v33, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_alignbit_b32 v40, v1, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; GCN-NEXT: v_alignbit_b32 v30, v1, v30, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; GCN-NEXT: v_alignbit_b32 v26, v1, v25, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v52 -; GCN-NEXT: v_alignbit_b32 v25, v1, v41, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; GCN-NEXT: v_alignbit_b32 v24, v1, v51, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_alignbit_b32 v28, v1, v53, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 24, v20 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v60 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v57, v20, v21, 16 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v45, v18, v20, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v41, v10, v18, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v55, v1, v10, 16 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v54, v62, v1, 16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v53, v12, v1, 16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v52, v11, v1, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v50, v4, v1, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v3, v17, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v2, v19, 16 -; GCN-NEXT: v_alignbit_b32 v39, v6, v16, 16 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v29, v22, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v8, v23, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v7, v43, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v5, v47, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v31 -; GCN-NEXT: v_alignbit_b32 v51, v5, v9, 16 -; GCN-NEXT: v_alignbit_b32 v4, v51, v28, 24 -; GCN-NEXT: v_alignbit_b32 v10, v51, v28, 16 -; GCN-NEXT: v_alignbit_b32 v3, v51, v28, 8 -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v8, v35, v24, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v12, v36, v25, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: .LBB45_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v51 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v8 -; GCN-NEXT: v_or_b32_e32 v29, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v36 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v50 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v49, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v4, v1, v30 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v5, v1, v33 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v61, v3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v37 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v38 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v48 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v44 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v45 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v46 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v47 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v56 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v58 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v59 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v60 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64bf16_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB90_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v50 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v54 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v35, v34, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v36 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v37, v38, v37, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v36 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_alignbit_b32 v38, v49, v38, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v39 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v48, v49, v48, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v39 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v50, v50, v49, 16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v52, v52, v49, 16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v54, v54, v49, 16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v40, v40, v49, 16 +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v41 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v55 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v53 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: .LBB90_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v29 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v37 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v34 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v15 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v31 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v13 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: @@ -81396,7 +165335,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill @@ -81584,12 +165523,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v47, v34 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: .LBB45_2: ; %Flow +; VI-NEXT: .LBB90_2: ; %Flow ; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_xor_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_4 +; VI-NEXT: s_cbranch_execz .LBB90_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 @@ -82347,7 +166286,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: .LBB45_4: ; %end +; VI-NEXT: .LBB90_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -82951,7 +166890,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB45_2 +; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill @@ -83150,11 +167089,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB45_2: ; %Flow +; GFX9-NEXT: .LBB90_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v58, v57 ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB45_4 +; GFX9-NEXT: s_cbranch_execz .LBB90_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -83938,7 +167877,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; GFX9-NEXT: .LBB45_4: ; %end +; GFX9-NEXT: .LBB90_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -84483,7 +168422,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -84598,9 +168537,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.h -; GFX11-TRUE16-NEXT: .LBB45_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB45_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 @@ -85224,7 +169163,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7 -; GFX11-TRUE16-NEXT: .LBB45_4: ; %end +; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l @@ -85717,7 +169656,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] @@ -85816,9 +169755,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] -; GFX11-FAKE16-NEXT: .LBB45_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB90_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB45_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v17 @@ -86087,676 +170026,9180 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo ; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v51 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 +; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 +; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] +; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 +; GFX11-FAKE16-NEXT: .LBB90_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB91_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v43, v36 +; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 +; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 +; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 +; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_mov_b32_e32 v17, v63 +; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8 +; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24 +; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8 +; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 +; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v23, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63 +; SI-NEXT: v_mov_b32_e32 v48, v33 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB91_3 +; SI-NEXT: .LBB91_2: +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v48, v33 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v17, v63 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: .LBB91_3: ; %Flow +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v17 +; SI-NEXT: v_mov_b32_e32 v54, v61 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB91_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40 +; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43 +; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42 +; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 +; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 +; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41 +; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 +; SI-NEXT: v_mov_b32_e32 v40, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54 +; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 +; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26 +; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28 +; SI-NEXT: v_or_b32_e32 v32, v36, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xff, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29 +; SI-NEXT: v_or_b32_e32 v36, v56, v36 +; SI-NEXT: v_or_b32_e32 v32, v32, v36 +; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51 +; SI-NEXT: v_or_b32_e32 v32, v32, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v36 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6 +; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v60 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v28, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2 +; SI-NEXT: v_or_b32_e32 v14, v28, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v22, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v22, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v38 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s44, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s42, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s40, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s14, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s12, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s10, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s6, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s80, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s81, s12, 8 +; VI-NEXT: s_lshr_b32 s82, s15, 24 +; VI-NEXT: s_lshr_b32 s83, s15, 16 +; VI-NEXT: s_lshr_b32 s85, s15, 8 +; VI-NEXT: s_lshr_b32 s84, s14, 16 +; VI-NEXT: s_lshr_b32 s86, s14, 8 +; VI-NEXT: s_lshr_b32 s87, s41, 24 +; VI-NEXT: s_lshr_b32 s50, s41, 16 +; VI-NEXT: s_lshr_b32 s52, s41, 8 +; VI-NEXT: s_lshr_b32 s51, s40, 16 +; VI-NEXT: s_lshr_b32 s53, s40, 8 +; VI-NEXT: s_lshr_b32 s54, s43, 24 +; VI-NEXT: s_lshr_b32 s55, s43, 16 +; VI-NEXT: s_lshr_b32 s65, s43, 8 +; VI-NEXT: s_lshr_b32 s64, s42, 16 +; VI-NEXT: s_lshr_b32 s66, s42, 8 +; VI-NEXT: s_lshr_b32 s67, s45, 24 +; VI-NEXT: s_lshr_b32 s68, s45, 16 +; VI-NEXT: s_lshr_b32 s70, s45, 8 +; VI-NEXT: s_lshr_b32 s69, s44, 16 +; VI-NEXT: s_lshr_b32 s71, s44, 8 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s46, s45, 16 +; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s46, v31 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s45, v31 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s45, s44, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s45, v31 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s44, s43, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s43, v31 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s43, s42, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s43, v31 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: s_lshl_b32 s42, s41, 16 +; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s41, s41, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s41, v31 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: s_lshl_b32 s41, s40, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 +; VI-NEXT: v_add_f32_e32 v5, s41, v31 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; VI-NEXT: v_add_f32_e32 v7, s40, v31 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: s_lshl_b32 s40, s15, 16 +; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 +; VI-NEXT: v_add_f32_e32 v7, s40, v31 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s15, v31 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s15, s14, 16 +; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; VI-NEXT: v_add_f32_e32 v7, s15, v31 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_lshl_b32 s14, s13, 16 +; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 +; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_add_f32_e32 v10, s13, v31 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_lshl_b32 s13, s12, 16 +; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; VI-NEXT: v_add_f32_e32 v9, s13, v31 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: s_lshl_b32 s12, s11, 16 +; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 +; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_add_f32_e32 v12, s11, v31 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_lshl_b32 s11, s10, 16 +; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 +; VI-NEXT: v_add_f32_e32 v11, s11, v31 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_lshl_b32 s10, s9, 16 +; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 +; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_add_f32_e32 v14, s9, v31 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_lshl_b32 s9, s8, 16 +; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; VI-NEXT: v_add_f32_e32 v13, s9, v31 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc +; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: s_lshl_b32 s8, s7, 16 +; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 +; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s7, v31 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s7, s6, 16 +; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 +; VI-NEXT: v_add_f32_e32 v15, s7, v31 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_add_f32_e32 v18, s6, v31 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc +; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 +; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; VI-NEXT: v_add_f32_e32 v20, s6, v31 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 +; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc +; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 +; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; VI-NEXT: v_add_f32_e32 v22, s6, v31 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 +; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc +; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_add_f32_e32 v24, s6, v31 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc +; VI-NEXT: v_add_f32_e32 v25, s6, v31 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 +; VI-NEXT: v_add_f32_e32 v25, s6, v31 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; VI-NEXT: v_add_f32_e32 v26, s6, v31 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 +; VI-NEXT: v_add_f32_e32 v25, s6, v31 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc +; VI-NEXT: v_add_f32_e32 v27, s6, v31 +; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 +; VI-NEXT: v_add_f32_e32 v27, s6, v31 +; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; VI-NEXT: v_add_f32_e32 v28, s6, v31 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: s_lshl_b32 s6, s26, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 +; VI-NEXT: v_add_f32_e32 v27, s6, v31 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc +; VI-NEXT: v_add_f32_e32 v29, s6, v31 +; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_lshl_b32 s6, s29, 16 +; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 +; VI-NEXT: v_add_f32_e32 v29, s6, v31 +; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc +; VI-NEXT: v_add_f32_e32 v30, s6, v31 +; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_lshl_b32 s6, s28, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 +; VI-NEXT: v_add_f32_e32 v29, s6, v31 +; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; VI-NEXT: v_add_f32_e32 v32, s6, v31 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: s_lshl_b32 s6, s5, 16 +; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 +; VI-NEXT: v_add_f32_e32 v32, s6, v31 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_add_f32_e32 v33, s5, v31 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: s_lshl_b32 s5, s4, 16 +; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16 +; VI-NEXT: v_add_f32_e32 v33, s5, v31 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v31, s4, v31 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: s_branch .LBB91_5 +; VI-NEXT: .LBB91_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v33, s71 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s69 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s70 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s67 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s86 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s83 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s82 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v54, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: v_mov_b32_e32 v53, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: v_mov_b32_e32 v51, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: v_mov_b32_e32 v47, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: v_mov_b32_e32 v43, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 13 +; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: v_mov_b32_e32 v42, s54 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s56 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s58 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s60 +; VI-NEXT: v_mov_b32_e32 v45, s72 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s74 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s76 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s78 +; VI-NEXT: v_mov_b32_e32 v55, s88 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, s66 +; VI-NEXT: v_mov_b32_e32 v52, s64 +; VI-NEXT: v_mov_b32_e32 v55, v50 +; VI-NEXT: v_mov_b32_e32 v35, s30 +; VI-NEXT: v_mov_b32_e32 v59, s87 +; VI-NEXT: v_mov_b32_e32 v58, s34 +; VI-NEXT: v_mov_b32_e32 v45, s36 +; VI-NEXT: v_mov_b32_e32 v34, s38 +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: v_mov_b32_e32 v2, s45 +; VI-NEXT: v_mov_b32_e32 v3, s42 +; VI-NEXT: v_mov_b32_e32 v4, s43 +; VI-NEXT: v_mov_b32_e32 v5, s40 +; VI-NEXT: v_mov_b32_e32 v6, s41 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: v_mov_b32_e32 v13, s8 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s6 +; VI-NEXT: v_mov_b32_e32 v16, s7 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v21, s20 +; VI-NEXT: v_mov_b32_e32 v22, s21 +; VI-NEXT: v_mov_b32_e32 v23, s22 +; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s26 +; VI-NEXT: v_mov_b32_e32 v28, s27 +; VI-NEXT: v_mov_b32_e32 v29, s28 +; VI-NEXT: v_mov_b32_e32 v30, s29 +; VI-NEXT: v_mov_b32_e32 v32, s5 +; VI-NEXT: v_mov_b32_e32 v41, s62 +; VI-NEXT: v_mov_b32_e32 v57, s81 +; VI-NEXT: v_mov_b32_e32 v37, s84 +; VI-NEXT: v_mov_b32_e32 v60, s52 +; VI-NEXT: v_mov_b32_e32 v38, s51 +; VI-NEXT: v_mov_b32_e32 v61, s65 +; VI-NEXT: v_mov_b32_e32 v49, s66 +; VI-NEXT: v_mov_b32_e32 v39, s55 +; VI-NEXT: v_mov_b32_e32 v50, v46 +; VI-NEXT: v_mov_b32_e32 v46, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v47, v56 +; VI-NEXT: v_mov_b32_e32 v56, v51 +; VI-NEXT: v_mov_b32_e32 v51, s90 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s85 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, s48 +; VI-NEXT: v_mov_b32_e32 v51, v53 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: v_mov_b32_e32 v54, v40 +; VI-NEXT: v_mov_b32_e32 v40, s80 +; VI-NEXT: v_mov_b32_e32 v58, s50 +; VI-NEXT: v_mov_b32_e32 v45, s53 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: .LBB91_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s76, v3 +; GFX9-NEXT: v_readfirstlane_b32 s77, v4 +; GFX9-NEXT: v_readfirstlane_b32 s74, v5 +; GFX9-NEXT: v_readfirstlane_b32 s75, v6 +; GFX9-NEXT: v_readfirstlane_b32 s72, v7 +; GFX9-NEXT: v_readfirstlane_b32 s73, v8 +; GFX9-NEXT: v_readfirstlane_b32 s62, v9 +; GFX9-NEXT: v_readfirstlane_b32 s63, v10 +; GFX9-NEXT: v_readfirstlane_b32 s60, v11 +; GFX9-NEXT: v_readfirstlane_b32 s61, v12 +; GFX9-NEXT: v_readfirstlane_b32 s58, v13 +; GFX9-NEXT: v_readfirstlane_b32 s59, v14 +; GFX9-NEXT: v_readfirstlane_b32 s56, v15 +; GFX9-NEXT: v_readfirstlane_b32 s57, v16 +; GFX9-NEXT: v_readfirstlane_b32 s46, v17 +; GFX9-NEXT: v_readfirstlane_b32 s47, v18 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 18 +; GFX9-NEXT: s_lshr_b32 s6, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 17 +; GFX9-NEXT: s_lshr_b32 s6, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 19 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 20 +; GFX9-NEXT: s_lshr_b32 s6, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 21 +; GFX9-NEXT: s_lshr_b32 s6, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 22 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 23 +; GFX9-NEXT: s_lshr_b32 s6, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 24 +; GFX9-NEXT: s_lshr_b32 s6, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 25 +; GFX9-NEXT: s_lshr_b32 s6, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 26 +; GFX9-NEXT: s_lshr_b32 s6, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 15 +; GFX9-NEXT: s_lshr_b32 s6, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 27 +; GFX9-NEXT: s_lshr_b32 s6, s26, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 28 +; GFX9-NEXT: s_lshr_b32 s6, s26, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 29 +; GFX9-NEXT: s_lshr_b32 s6, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 30 +; GFX9-NEXT: s_lshr_b32 s6, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 14 +; GFX9-NEXT: s_lshr_b32 s6, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 31 +; GFX9-NEXT: s_lshr_b32 s6, s24, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 32 +; GFX9-NEXT: s_lshr_b32 s6, s24, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 33 +; GFX9-NEXT: s_lshr_b32 s6, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 34 +; GFX9-NEXT: s_lshr_b32 s6, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 13 +; GFX9-NEXT: s_lshr_b32 s6, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 35 +; GFX9-NEXT: s_lshr_b32 s6, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 36 +; GFX9-NEXT: s_lshr_b32 s6, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 37 +; GFX9-NEXT: s_lshr_b32 s6, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 38 +; GFX9-NEXT: s_lshr_b32 s6, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 12 +; GFX9-NEXT: s_lshr_b32 s6, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 39 +; GFX9-NEXT: s_lshr_b32 s6, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 40 +; GFX9-NEXT: s_lshr_b32 s6, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 41 +; GFX9-NEXT: s_lshr_b32 s6, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 42 +; GFX9-NEXT: s_lshr_b32 s6, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 11 +; GFX9-NEXT: s_lshr_b32 s6, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 43 +; GFX9-NEXT: s_lshr_b32 s6, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 44 +; GFX9-NEXT: s_lshr_b32 s6, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 45 +; GFX9-NEXT: s_lshr_b32 s6, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 46 +; GFX9-NEXT: s_lshr_b32 s6, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 10 +; GFX9-NEXT: s_lshr_b32 s6, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 47 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 48 +; GFX9-NEXT: s_lshr_b32 s6, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 49 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 8 +; GFX9-NEXT: v_writelane_b32 v62, s41, 9 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[28:29], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 6 +; GFX9-NEXT: v_writelane_b32 v62, s41, 7 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 4 +; GFX9-NEXT: v_writelane_b32 v62, s41, 5 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[24:25], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 2 +; GFX9-NEXT: v_writelane_b32 v62, s41, 3 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[22:23], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 0 +; GFX9-NEXT: s_lshr_b32 s70, s47, 24 +; GFX9-NEXT: s_lshr_b32 s15, s47, 16 +; GFX9-NEXT: s_lshr_b32 s7, s47, 8 +; GFX9-NEXT: s_lshr_b32 s53, s46, 16 +; GFX9-NEXT: s_lshr_b32 s52, s46, 8 +; GFX9-NEXT: s_lshr_b32 s67, s57, 24 +; GFX9-NEXT: s_lshr_b32 s14, s57, 16 +; GFX9-NEXT: s_lshr_b32 s69, s57, 8 +; GFX9-NEXT: s_lshr_b32 s6, s56, 16 +; GFX9-NEXT: s_lshr_b32 s71, s56, 8 +; GFX9-NEXT: s_lshr_b32 s64, s59, 24 +; GFX9-NEXT: s_lshr_b32 s13, s59, 16 +; GFX9-NEXT: s_lshr_b32 s66, s59, 8 +; GFX9-NEXT: s_lshr_b32 s51, s58, 16 +; GFX9-NEXT: s_lshr_b32 s68, s58, 8 +; GFX9-NEXT: s_lshr_b32 s99, s61, 24 +; GFX9-NEXT: s_lshr_b32 s12, s61, 16 +; GFX9-NEXT: s_lshr_b32 s55, s61, 8 +; GFX9-NEXT: s_lshr_b32 s50, s60, 16 +; GFX9-NEXT: s_lshr_b32 s65, s60, 8 +; GFX9-NEXT: s_lshr_b32 s96, s63, 24 +; GFX9-NEXT: s_lshr_b32 s11, s63, 16 +; GFX9-NEXT: s_lshr_b32 s98, s63, 8 +; GFX9-NEXT: s_lshr_b32 s49, s62, 16 +; GFX9-NEXT: s_lshr_b32 s54, s62, 8 +; GFX9-NEXT: s_lshr_b32 s85, s73, 24 +; GFX9-NEXT: s_lshr_b32 s10, s73, 16 +; GFX9-NEXT: s_lshr_b32 s87, s73, 8 +; GFX9-NEXT: s_lshr_b32 s48, s72, 16 +; GFX9-NEXT: s_lshr_b32 s97, s72, 8 +; GFX9-NEXT: s_lshr_b32 s82, s75, 24 +; GFX9-NEXT: s_lshr_b32 s9, s75, 16 +; GFX9-NEXT: s_lshr_b32 s84, s75, 8 +; GFX9-NEXT: s_lshr_b32 s39, s74, 16 +; GFX9-NEXT: s_lshr_b32 s86, s74, 8 +; GFX9-NEXT: s_lshr_b32 s80, s77, 24 +; GFX9-NEXT: s_lshr_b32 s8, s77, 16 +; GFX9-NEXT: s_lshr_b32 s81, s77, 8 +; GFX9-NEXT: s_lshr_b32 s38, s76, 16 +; GFX9-NEXT: s_lshr_b32 s83, s76, 8 +; GFX9-NEXT: v_writelane_b32 v62, s41, 1 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[58:59], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[60:61], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[76:77], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s6, s77, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s77, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_and_b32 s6, s76, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s76, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: s_and_b32 s6, s75, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s75, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s74, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; GFX9-NEXT: s_and_b32 s6, s73, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s73, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; GFX9-NEXT: s_and_b32 s6, s72, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v16, v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s72, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v36 +; GFX9-NEXT: s_and_b32 s6, s63, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v15, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s63, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v37 +; GFX9-NEXT: s_and_b32 s6, s62, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s62, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; GFX9-NEXT: s_and_b32 s6, s61, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v17, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s61, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v48 +; GFX9-NEXT: s_and_b32 s6, s60, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s60, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX9-NEXT: s_and_b32 s6, s59, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s59, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; GFX9-NEXT: s_and_b32 s6, s58, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s58, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v53 +; GFX9-NEXT: s_and_b32 s6, s57, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s57, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; GFX9-NEXT: s_and_b32 s6, s56, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s56, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; GFX9-NEXT: s_and_b32 s6, s47, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v23, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s47, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; GFX9-NEXT: s_and_b32 s6, s46, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s46, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v25, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s17, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s16, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s12, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s19, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s18, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s21, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s20, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s14, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s23, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s22, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s15, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s25, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s24, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s76, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s27, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s29, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s28, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_lshr_b32 s78, s6, 16 +; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s6, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s6, s10 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[23:24] +; GFX9-NEXT: s_cselect_b32 s4, s4, s10 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s13 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s5, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s6 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 +; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[38:39], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[48:49], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[50:51], s[60:61], 24 +; GFX9-NEXT: s_lshr_b32 s9, s7, 24 +; GFX9-NEXT: s_lshr_b32 s10, s7, 8 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s43, s6, 8 +; GFX9-NEXT: s_lshr_b32 s45, s75, 24 +; GFX9-NEXT: s_lshr_b32 s75, s75, 8 +; GFX9-NEXT: s_lshr_b32 s79, s74, 16 +; GFX9-NEXT: s_lshr_b32 s74, s74, 8 +; GFX9-NEXT: s_lshr_b32 s88, s73, 24 +; GFX9-NEXT: s_lshr_b32 s73, s73, 8 +; GFX9-NEXT: s_lshr_b32 s89, s72, 16 +; GFX9-NEXT: s_lshr_b32 s72, s72, 8 +; GFX9-NEXT: s_lshr_b32 s90, s63, 24 +; GFX9-NEXT: s_lshr_b32 s63, s63, 8 +; GFX9-NEXT: s_lshr_b32 s91, s62, 16 +; GFX9-NEXT: s_lshr_b32 s62, s62, 8 +; GFX9-NEXT: s_lshr_b32 s92, s61, 24 +; GFX9-NEXT: s_lshr_b32 s61, s61, 8 +; GFX9-NEXT: s_lshr_b32 s93, s60, 16 +; GFX9-NEXT: s_lshr_b32 s60, s60, 8 +; GFX9-NEXT: s_lshr_b32 s94, s59, 24 +; GFX9-NEXT: s_lshr_b32 s59, s59, 8 +; GFX9-NEXT: s_lshr_b32 s95, s58, 16 +; GFX9-NEXT: s_lshr_b32 s58, s58, 8 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s57, 24 +; GFX9-NEXT: s_lshr_b32 s57, s57, 8 +; GFX9-NEXT: s_lshr_b32 vcc_hi, s56, 16 +; GFX9-NEXT: s_lshr_b32 s56, s56, 8 +; GFX9-NEXT: s_lshr_b32 s30, s47, 24 +; GFX9-NEXT: s_lshr_b32 s47, s47, 8 +; GFX9-NEXT: s_lshr_b32 s8, s46, 16 +; GFX9-NEXT: s_lshr_b32 s7, s46, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_branch .LBB91_5 +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 0 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 1 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 2 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 3 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 4 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 5 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 7 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 9 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v1, s76 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v1, s77 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v46, s51 +; GFX9-NEXT: v_mov_b32_e32 v56, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s49 +; GFX9-NEXT: v_mov_b32_e32 v60, s48 +; GFX9-NEXT: v_mov_b32_e32 v27, s39 +; GFX9-NEXT: v_mov_b32_e32 v29, s38 +; GFX9-NEXT: v_mov_b32_e32 v10, s34 +; GFX9-NEXT: v_mov_b32_e32 v11, s36 +; GFX9-NEXT: v_readlane_b32 s34, v62, 8 +; GFX9-NEXT: v_readlane_b32 s36, v62, 6 +; GFX9-NEXT: v_readlane_b32 s38, v62, 4 +; GFX9-NEXT: v_readlane_b32 s48, v62, 2 +; GFX9-NEXT: v_readlane_b32 s50, v62, 0 +; GFX9-NEXT: v_mov_b32_e32 v42, s46 +; GFX9-NEXT: v_mov_b32_e32 v41, s47 +; GFX9-NEXT: v_mov_b32_e32 v55, s15 +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: v_mov_b32_e32 v54, s57 +; GFX9-NEXT: v_mov_b32_e32 v52, s14 +; GFX9-NEXT: v_mov_b32_e32 v53, s58 +; GFX9-NEXT: v_mov_b32_e32 v51, s59 +; GFX9-NEXT: v_mov_b32_e32 v49, s13 +; GFX9-NEXT: v_mov_b32_e32 v50, s60 +; GFX9-NEXT: v_mov_b32_e32 v48, s61 +; GFX9-NEXT: v_mov_b32_e32 v38, s12 +; GFX9-NEXT: v_mov_b32_e32 v39, s62 +; GFX9-NEXT: v_mov_b32_e32 v37, s63 +; GFX9-NEXT: v_mov_b32_e32 v35, s11 +; GFX9-NEXT: v_mov_b32_e32 v36, s72 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v32, s10 +; GFX9-NEXT: v_mov_b32_e32 v33, s74 +; GFX9-NEXT: v_mov_b32_e32 v31, s75 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v26, s53 +; GFX9-NEXT: v_mov_b32_e32 v25, s52 +; GFX9-NEXT: v_mov_b32_e32 v6, s70 +; GFX9-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-NEXT: v_mov_b32_e32 v44, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s71 +; GFX9-NEXT: v_mov_b32_e32 v43, s67 +; GFX9-NEXT: v_mov_b32_e32 v24, s69 +; GFX9-NEXT: v_mov_b32_e32 v21, s68 +; GFX9-NEXT: v_mov_b32_e32 v45, s64 +; GFX9-NEXT: v_mov_b32_e32 v22, s66 +; GFX9-NEXT: v_mov_b32_e32 v19, s65 +; GFX9-NEXT: v_mov_b32_e32 v47, s99 +; GFX9-NEXT: v_mov_b32_e32 v20, s55 +; GFX9-NEXT: v_mov_b32_e32 v17, s54 +; GFX9-NEXT: v_mov_b32_e32 v57, s96 +; GFX9-NEXT: v_mov_b32_e32 v18, s98 +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: v_mov_b32_e32 v59, s85 +; GFX9-NEXT: v_mov_b32_e32 v16, s87 +; GFX9-NEXT: v_mov_b32_e32 v13, s86 +; GFX9-NEXT: v_mov_b32_e32 v61, s82 +; GFX9-NEXT: v_mov_b32_e32 v14, s84 +; GFX9-NEXT: v_mov_b32_e32 v7, s83 +; GFX9-NEXT: v_mov_b32_e32 v28, s80 +; GFX9-NEXT: v_mov_b32_e32 v8, s81 +; GFX9-NEXT: v_mov_b32_e32 v1, s78 +; GFX9-NEXT: v_mov_b32_e32 v2, s88 +; GFX9-NEXT: v_mov_b32_e32 v3, s90 +; GFX9-NEXT: v_mov_b32_e32 v4, s92 +; GFX9-NEXT: v_mov_b32_e32 v5, s94 +; GFX9-NEXT: v_mov_b32_e32 v9, s30 +; GFX9-NEXT: v_readlane_b32 s11, v62, 10 +; GFX9-NEXT: v_readlane_b32 s12, v62, 11 +; GFX9-NEXT: v_readlane_b32 s13, v62, 12 +; GFX9-NEXT: v_readlane_b32 s14, v62, 13 +; GFX9-NEXT: v_readlane_b32 s15, v62, 14 +; GFX9-NEXT: v_readlane_b32 s76, v62, 15 +; GFX9-NEXT: v_readlane_b32 s77, v62, 16 +; GFX9-NEXT: v_readlane_b32 s78, v62, 17 +; GFX9-NEXT: v_readlane_b32 s9, v62, 18 +; GFX9-NEXT: v_readlane_b32 s10, v62, 19 +; GFX9-NEXT: v_readlane_b32 s41, v62, 20 +; GFX9-NEXT: v_readlane_b32 s43, v62, 21 +; GFX9-NEXT: v_readlane_b32 s45, v62, 22 +; GFX9-NEXT: v_readlane_b32 s75, v62, 23 +; GFX9-NEXT: v_readlane_b32 s79, v62, 24 +; GFX9-NEXT: v_readlane_b32 s74, v62, 25 +; GFX9-NEXT: v_readlane_b32 s88, v62, 26 +; GFX9-NEXT: v_readlane_b32 s73, v62, 27 +; GFX9-NEXT: v_readlane_b32 s89, v62, 28 +; GFX9-NEXT: v_readlane_b32 s72, v62, 29 +; GFX9-NEXT: v_readlane_b32 s90, v62, 30 +; GFX9-NEXT: v_readlane_b32 s63, v62, 31 +; GFX9-NEXT: v_readlane_b32 s91, v62, 32 +; GFX9-NEXT: v_readlane_b32 s62, v62, 33 +; GFX9-NEXT: v_readlane_b32 s92, v62, 34 +; GFX9-NEXT: v_readlane_b32 s61, v62, 35 +; GFX9-NEXT: v_readlane_b32 s93, v62, 36 +; GFX9-NEXT: v_readlane_b32 s60, v62, 37 +; GFX9-NEXT: v_readlane_b32 s94, v62, 38 +; GFX9-NEXT: v_readlane_b32 s59, v62, 39 +; GFX9-NEXT: v_readlane_b32 s95, v62, 40 +; GFX9-NEXT: v_readlane_b32 s58, v62, 41 +; GFX9-NEXT: v_readlane_b32 vcc_lo, v62, 42 +; GFX9-NEXT: v_readlane_b32 s57, v62, 43 +; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 44 +; GFX9-NEXT: v_readlane_b32 s56, v62, 45 +; GFX9-NEXT: v_readlane_b32 s30, v62, 46 +; GFX9-NEXT: v_readlane_b32 s47, v62, 47 +; GFX9-NEXT: v_readlane_b32 s8, v62, 48 +; GFX9-NEXT: v_readlane_b32 s7, v62, 49 +; GFX9-NEXT: v_readlane_b32 s35, v62, 9 +; GFX9-NEXT: v_readlane_b32 s37, v62, 7 +; GFX9-NEXT: v_readlane_b32 s39, v62, 5 +; GFX9-NEXT: v_readlane_b32 s49, v62, 3 +; GFX9-NEXT: v_readlane_b32 s51, v62, 1 +; GFX9-NEXT: .LBB91_5: ; %end +; GFX9-NEXT: s_and_b32 s6, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s44, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s47, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s30, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s56, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, vcc_hi, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s42, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s57, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s8, vcc_lo, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s58, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s40, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s59, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s94, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s50, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s92, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s48, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s90, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s72, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s89, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s38, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s88, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s36, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s75, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s45, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s34, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s10, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s9, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v58, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v7, v33, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v27, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v7, v31, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v7, v34, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v32, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v7, v39, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v5, v37, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v38, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s72, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s102, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s43, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s43, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s43, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s42, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s45, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s45, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s45, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s44, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s44, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s59, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s14, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s59, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s59, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s15, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s58, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s61, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s61, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s61, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s60, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s60, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s63, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s63, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s63, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s62, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s73, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s73, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s73, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s72, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s72, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s29, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s28, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[100:101], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s62, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 26 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 19 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[18:19], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 12 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-TRUE16-NEXT: .LBB91_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s58, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s58, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s41, 0xffff0000 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s41, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s77, s28, 0xffff0000 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s58, 0x10010 +; GFX11-TRUE16-NEXT: s_lshl_b32 s78, s28, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s6, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s73, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s76, s73, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s74, s72, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s75, s72, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s63, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s73, s63, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s63, s62, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s62, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s62, s61, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s61, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s47, s60, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s60, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s59, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s59, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s45, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s45, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s45, s44, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s29, s43, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s13, s42, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s42, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s40, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s58, 22 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s58, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s40, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s77 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s78 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s1, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s1 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v4, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v51, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s76 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v4 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s5, s0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v1, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v3 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s75 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s40, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s74 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s3, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s3 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s73 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s2, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s12, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v24, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v9, 16, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s72 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s63 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s17, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s17, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s16, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s57 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s16, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s16 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s16, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v6, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s47 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s56 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s40, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s19, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v10 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s19, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s19, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s46 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s15 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s40, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v28, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v5, 16, v10 +; GFX11-TRUE16-NEXT: s_bfe_u32 s28, s18, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, s18 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s28, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s18, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v11 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s14, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v12 +; GFX11-TRUE16-NEXT: s_bfe_u32 s21, s15, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, s15 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s15, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s21, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s15, s21 +; GFX11-TRUE16-NEXT: s_and_b32 s15, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s14, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s2, s12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s11, 0x10010 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s11 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s11, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s20, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v6, 16, v11 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s9, s14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s11, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s11, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s45 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s44 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s11, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s11, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s43 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v71, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s13 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v70, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v11 +; GFX11-TRUE16-NEXT: s_bfe_u32 s13, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s13, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s10, 0x10010 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s10 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s10, s10, s14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s10, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_bfe_u32 s13, s8, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v48, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s7, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s7, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s24, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v51, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s4, s8, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v51 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s4, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v8, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s4, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v66, v5, 16, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v38, 16, v9 +; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v55, v48, 16, v8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v54, v6, 16, v10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s3, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s17, s60 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s6, 0x10010 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s16, s42 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s6, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s27, s73 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[66:67] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[70:71] +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s12, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19] +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, s12 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s12, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s11, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s12, s11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[14:15] +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[1:2] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s26, s13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 24, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[44:45], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[42:43], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[36:37], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[38:39], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s45, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s45, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s44, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s44, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s7, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s4, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s43, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s43, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s41, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB91_5 +; GFX11-TRUE16-NEXT: .LBB91_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s74, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s75, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB91_2 +; GFX11-TRUE16-NEXT: .LBB91_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s39 +; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v42, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v43, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s92 +; GFX11-TRUE16-NEXT: v_readlane_b32 s58, v43, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v42, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s73, v42, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v42, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v42, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v42, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v42, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v43, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s74, v43, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s75, v43, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s76, v43, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s88, v43, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v43, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v43, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v43, 9 +; GFX11-TRUE16-NEXT: s_mov_b32 s92, s100 +; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v43, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v43, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v42, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s42, v43, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v43, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v43, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 3 +; GFX11-TRUE16-NEXT: .LBB91_5: ; %end +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s104 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s56 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s103 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s102 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s99 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s46 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s31 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s93 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s91 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s38 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s78 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s76 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s94 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s36 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s72 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s47 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s73 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 8, v100 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v101, s2 :: v_dual_lshlrev_b32 v12, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v102, s3 :: v_dual_and_b32 v99, 0xff, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v99, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v99, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xff, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v98 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v87, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v29, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v84, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v26, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v21, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v22, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v12, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v28, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v18, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v19, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v22, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v28, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v100, s1 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v12, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v13 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[99:102], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:112 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v41, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v41, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v41, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v41, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v41, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v41, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v41, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v41, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v41, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v40, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v40, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v40, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v40, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v40, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v40, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v40, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v40, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v40, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v40, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v40, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v40, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v40, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v40, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v40, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v40, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v40, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v40, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v40, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v40, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v40, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s72, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s102, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s45, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s45, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s45, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s44, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s44, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s47, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s47, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s47, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s46, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s57, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s57, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s57, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s56, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s56, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s59, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s59, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s59, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s58, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s58, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s61, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s61, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s61, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s60, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s63, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s63, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s63, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s62, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s73, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s73, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s73, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s72, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[44:45], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[46:47], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s46, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-FAKE16-NEXT: .LBB91_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s47, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s47, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s45, 0xffff0000 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s45, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s78, s28, 0xffff0000 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s47, 0x10010 +; GFX11-FAKE16-NEXT: s_lshl_b32 s79, s28, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s6, s47 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s73, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s75, s72, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s76, s72, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s74, s63, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s72, s62, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s73, s62, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s63, s61, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s62, s61, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s61, s60, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s60, s60, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s41, s59, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s40, s59, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s13, s57, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s57, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s42, s56, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s56, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s46, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s46, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s44, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s44, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s44, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s78 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s79 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s1, 0x10010 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s1 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s77 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v4 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s5, s0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s44, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s76 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s75 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s3, 0x10010 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s3 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s74 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s2, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s11, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s44, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s73 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s72 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s44, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s17, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v27 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s17 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s17, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v28, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s44, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s62 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s16 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v10 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s61 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s44, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v9, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s19, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s19 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s19, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s19, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s41 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s47, s17, s72 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s41, 0x10010 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s41 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s41, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s41, s41, s44 +; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_bfe_u32 s40, s18, 0x10010 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s28 +; GFX11-FAKE16-NEXT: s_add_i32 s44, s40, s18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s41, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s18, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s18, s18, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s41, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s28, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s28 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s28, s28, s29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s21 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s2, s11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] -; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 -; GFX11-FAKE16-NEXT: .LBB45_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s21, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s21 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s21, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s21, s21, s29 +; GFX11-FAKE16-NEXT: s_and_b32 s28, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s3, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s46, s16, s46 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s13, 0x10010 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s13 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s13, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s13, s28 +; GFX11-FAKE16-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v2, 16, v9 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s20, 0x10010 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s10, s20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s13, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s20, 22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s20, s28 +; GFX11-FAKE16-NEXT: s_and_b32 s20, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s43 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s20, s28, 0x10010 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s20, s28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s13, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s28, s29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s23, s23, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s13, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s23, 0x10010 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s23 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s23, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s23, s28 +; GFX11-FAKE16-NEXT: s_and_b32 s23, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s13, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s12 +; GFX11-FAKE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s14, s15 +; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 16 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s14, 0x10010 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s14 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s14, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s14, s9, 0x10010 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s14, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s9, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s8, 0x10010 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s8, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v12, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s7, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s24, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s4, s8, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s4, 16 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v4, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v1, 16, v11 +; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v50, 16, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v48, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18] +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s6, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s14, s6, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s20, s10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s14, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7] +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71] +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s11, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s19, s60 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s10, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s23, s62 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 24, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s21, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s25, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s57, s27, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s56, s26, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s24, s12 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[46:47], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[44:45], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[56:57], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s57, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s57, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s56, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s56, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s47, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s47, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s46, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s46, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s45, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s44, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s44, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s29, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-FAKE16-NEXT: s_branch .LBB91_5 +; GFX11-FAKE16-NEXT: .LBB91_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: s_branch .LBB91_2 +; GFX11-FAKE16-NEXT: .LBB91_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35 +; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, s11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v43, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v43, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v43, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s61, v43, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v43, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v43, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v43, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v43, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v43, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v43, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v43, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s56, v43, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v43, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v43, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v43, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v43, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v43, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v43, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v43, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v43, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v43, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5 +; GFX11-FAKE16-NEXT: .LBB91_5: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s103, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s102, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s100, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s99, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s45, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s95, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s46, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s93, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s72, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s92, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s91, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s90, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s89, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s60, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s79, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s78, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s75, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s94, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s9, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s74, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s34, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s11, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s63, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s56, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s43, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s13, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v96, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v6, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v11, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v22, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v26, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v24, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v22, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v26, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v6, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v11, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v15, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v19, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v20, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v21, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v10, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v11, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v5 ; GFX11-FAKE16-NEXT: s_clause 0x5 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v41, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v41, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v41, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v41, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v41, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v41, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v41, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v41, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v41, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v40, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v40, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v40, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v40, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v40, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v40, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v40, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v40, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v40, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v40, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v40, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v40, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v40, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v40, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v40, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v40, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v40, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v40, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v40, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v40, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v40, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -86777,2301 +179220,2298 @@ end: } define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v50, v27 -; GCN-NEXT: v_mov_b32_e32 v49, v25 -; GCN-NEXT: v_mov_b32_e32 v39, v21 -; GCN-NEXT: v_mov_b32_e32 v48, v3 -; GCN-NEXT: v_mov_b32_e32 v37, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v4 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v57, 8, v4 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v2 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:376 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v3 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_mov_b32_e32 v26, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v11 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v19 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v39 -; GCN-NEXT: v_or_b32_e32 v25, v0, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v0, v24 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GCN-NEXT: v_or_b32_e32 v33, v0, v28 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v29 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v14 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v6, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v6, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v51 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: v_mov_b32_e32 v7, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v12, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v30 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v12, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v7 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v5 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v53, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v5 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v40, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v13 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v42, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v15 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v43, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v46, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v51, v51, v52 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v52, v52, v54 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v54, v54, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v56 -; GCN-NEXT: v_or_b32_e32 v55, v55, v41 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v41, v41, v44 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v44, v44, v45 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v45, v45, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v58 -; GCN-NEXT: v_or_b32_e32 v47, v47, v57 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v56, v56, v62 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v31, v57, v31 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v60 -; GCN-NEXT: v_or_b32_e32 v57, v57, v63 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v61 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v59 -; GCN-NEXT: v_or_b32_e32 v3, v58, v3 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v25 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v23 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v32 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v33 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v36 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v3 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: .LBB46_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v59 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v6, v3, v2 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v61 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v16, v63, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v20, v31, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v62, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v57, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v47, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v45, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v44, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v41, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v55, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v54, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v52, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v3, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v0, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v0, v26 -; GCN-NEXT: v_mov_b32_e32 v0, v37 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v4, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v1, v34 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v21, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v25, v49 -; GCN-NEXT: v_mov_b32_e32 v27, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v51 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v52 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v53 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v54 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v55 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v2, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v2, v41 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v42 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v4, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v45 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v21 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_or_b32_e32 v46, v22, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v2, v47 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 -; GCN-NEXT: v_or_b32_e32 v56, v18, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v15 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v2, v57 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v13 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v58 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v58, v2, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v11 -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v59 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v2, v59 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v9 -; GCN-NEXT: v_and_b32_e32 v60, 0xff, v60 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v2, v60 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v7 -; GCN-NEXT: v_and_b32_e32 v61, 0xff, v61 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v2, v61 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v62, 0xff, v62 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v2, v62 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v63, 0xff, v63 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v63 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v0, v3 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x300, v6 -; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v26 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v46, vcc, s6, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, s6, v47 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, s6, v57 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s6, v59 -; GCN-NEXT: v_add_i32_e32 v60, vcc, s6, v60 -; GCN-NEXT: v_add_i32_e32 v61, vcc, s6, v61 -; GCN-NEXT: v_add_i32_e32 v62, vcc, s6, v62 -; GCN-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s6, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v63 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v62 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v61 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v60 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v58 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v56 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v47 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v46 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v14 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v22 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v31 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v44 -; GCN-NEXT: .LBB46_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, v50, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v45, v1, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v44, v1, v0 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v47, v1, v0 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v46, v1, v0 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v18, vcc, 40, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v22, vcc, 44, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v63, v1, v0 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 48, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v60, v1, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 52, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v30, v1, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 56, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v61, v1, v0 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v7, v1, v0 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 64, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v62, v1, v0 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x44, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v5, v1, v0 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x48, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v52, v12, v0 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v54, v12, v0 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v32 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v0 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x54, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v11, v11, v0 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x58, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v15, v12, v0 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x5c, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v19, v6, v0 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x60, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v9, v6, v0 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v13, v2, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x68, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v17, v2, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v21, v2, v0 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x70, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v25, v2, v0 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x74, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v29, v2, v0 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x78, v50 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v50 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v48, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v17 +; SI-NEXT: v_mov_b32_e32 v41, v7 +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_mov_b32_e32 v30, v0 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:392 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v45, v46, v45 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v56, v56, v61 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v31, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v23, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v29, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v51, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v4, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v35, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v34, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v8, v53 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v16, v8, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v24, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v26, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v22, v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v36, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v36, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v37, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v38, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v39, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v39, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v48, v48, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v49, v49, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v50, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v9, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v54, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v54, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v52, v52, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v53, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v53, v53, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v55, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v55, v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v41, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v41, v41, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v40, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v40, v40, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v42, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v43, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v34, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v44, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v44, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v46, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v46, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v58, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v58, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v59, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v60, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v60, v60, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v21, v61, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v61, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v61, v63 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: .LBB92_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v4, v3, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v1, v17 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v19, v44 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v23, v63, v2 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v25, v47, v2 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v27, v62, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v29, v61, v2 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v31, v60, v2 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v33, v59, v2 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v11, v12, v59 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v34, v58, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v36, v45, v2 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v0, v8 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v40, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v32, v32, v42 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v48 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v0, v47 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v34 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v0, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v33 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v0, v57 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v31 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v29 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_or_b32_e32 v0, v2, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v6, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v0, v63 +; SI-NEXT: v_or_b32_e32 v0, v1, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v56 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: .LBB92_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v30 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v30 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64f16: ; VI: ; %bb.0: @@ -89405,7 +181845,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -89887,9 +182327,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: .LBB46_2: ; %Flow +; VI-NEXT: .LBB92_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_4 +; VI-NEXT: s_cbranch_execz .LBB92_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload @@ -90276,7 +182716,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 ; VI-NEXT: v_or_b32_e32 v21, v39, v21 ; VI-NEXT: v_or_b32_e32 v31, v31, v54 -; VI-NEXT: .LBB46_4: ; %end +; VI-NEXT: .LBB92_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -90652,7 +183092,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: s_cbranch_execz .LBB92_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -91135,9 +183575,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: .LBB46_2: ; %Flow +; GFX9-NEXT: .LBB92_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB46_4 +; GFX9-NEXT: s_cbranch_execz .LBB92_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload @@ -91528,7 +183968,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 ; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 ; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 -; GFX9-NEXT: .LBB46_4: ; %end +; GFX9-NEXT: .LBB92_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -91773,15 +184213,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_4 -; GFX11-TRUE16-NEXT: .LBB46_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4 +; GFX11-TRUE16-NEXT: .LBB92_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB46_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l @@ -92040,8 +184480,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 -; GFX11-TRUE16-NEXT: .LBB46_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_2 +; GFX11-TRUE16-NEXT: .LBB92_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 @@ -92581,7 +185021,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 @@ -92872,9 +185312,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: .LBB46_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB92_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 @@ -92922,316 +185362,5883 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: .LBB92_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v61, s29, 0 +; SI-NEXT: v_writelane_b32 v61, s28, 1 +; SI-NEXT: v_writelane_b32 v61, s27, 2 +; SI-NEXT: s_mov_b32 s61, s21 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: s_mov_b32 s67, s19 +; SI-NEXT: s_mov_b32 s54, s17 +; SI-NEXT: s_mov_b32 s35, s23 +; SI-NEXT: s_mov_b32 s39, s26 +; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s99, v1 +; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s6, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v62, s74, 0 +; SI-NEXT: v_readfirstlane_b32 s12, v26 +; SI-NEXT: v_writelane_b32 v62, s6, 1 +; SI-NEXT: v_readfirstlane_b32 s14, v25 +; SI-NEXT: v_writelane_b32 v62, s12, 2 +; SI-NEXT: v_readfirstlane_b32 s46, v28 +; SI-NEXT: v_writelane_b32 v62, s14, 3 +; SI-NEXT: v_readfirstlane_b32 s56, v27 +; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_readfirstlane_b32 s57, v30 +; SI-NEXT: v_writelane_b32 v62, s56, 5 +; SI-NEXT: v_readfirstlane_b32 s59, v29 +; SI-NEXT: v_writelane_b32 v62, s57, 6 +; SI-NEXT: v_writelane_b32 v62, s59, 7 +; SI-NEXT: s_mov_b32 s60, s20 +; SI-NEXT: s_mov_b32 s63, s24 +; SI-NEXT: v_readfirstlane_b32 s95, v3 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_readfirstlane_b32 s38, v12 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s27, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s79, v15 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 +; SI-NEXT: v_writelane_b32 v61, s4, 3 +; SI-NEXT: v_readfirstlane_b32 s45, v21 +; SI-NEXT: v_readfirstlane_b32 s98, v10 +; SI-NEXT: v_readfirstlane_b32 s90, v8 +; SI-NEXT: v_readfirstlane_b32 s88, v7 +; SI-NEXT: v_readfirstlane_b32 s91, v6 +; SI-NEXT: v_readfirstlane_b32 s93, v4 +; SI-NEXT: v_readfirstlane_b32 s55, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 +; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 +; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 +; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 +; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 +; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 +; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 +; SI-NEXT: v_writelane_b32 v61, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 +; SI-NEXT: v_writelane_b32 v61, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 +; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 +; SI-NEXT: v_writelane_b32 v61, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 +; SI-NEXT: v_writelane_b32 v61, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 +; SI-NEXT: v_writelane_b32 v61, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 +; SI-NEXT: v_writelane_b32 v61, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 +; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 +; SI-NEXT: v_writelane_b32 v61, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 +; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 +; SI-NEXT: v_writelane_b32 v61, s4, 31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 +; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s16, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 +; SI-NEXT: v_writelane_b32 v61, s4, 33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 +; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 +; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: v_writelane_b32 v61, s4, 36 +; SI-NEXT: v_writelane_b32 v61, s54, 37 +; SI-NEXT: v_writelane_b32 v61, s10, 38 +; SI-NEXT: v_writelane_b32 v61, s67, 39 +; SI-NEXT: v_writelane_b32 v61, s18, 40 +; SI-NEXT: v_writelane_b32 v61, s61, 41 +; SI-NEXT: v_writelane_b32 v61, s60, 42 +; SI-NEXT: v_writelane_b32 v61, s35, 43 +; SI-NEXT: v_writelane_b32 v61, s22, 44 +; SI-NEXT: v_writelane_b32 v61, s62, 45 +; SI-NEXT: v_writelane_b32 v61, s63, 46 +; SI-NEXT: v_writelane_b32 v61, s39, 47 +; SI-NEXT: v_writelane_b32 v61, s99, 48 +; SI-NEXT: v_writelane_b32 v61, s95, 49 +; SI-NEXT: v_writelane_b32 v61, s31, 50 +; SI-NEXT: v_writelane_b32 v61, s24, 51 +; SI-NEXT: v_writelane_b32 v61, s38, 52 +; SI-NEXT: v_writelane_b32 v61, s36, 53 +; SI-NEXT: v_writelane_b32 v61, s8, 54 +; SI-NEXT: v_writelane_b32 v61, s27, 55 +; SI-NEXT: v_writelane_b32 v61, s9, 56 +; SI-NEXT: v_writelane_b32 v61, s79, 57 +; SI-NEXT: v_writelane_b32 v61, s13, 58 +; SI-NEXT: v_writelane_b32 v61, s15, 59 +; SI-NEXT: v_writelane_b32 v61, s42, 60 +; SI-NEXT: v_writelane_b32 v61, s43, 61 +; SI-NEXT: v_writelane_b32 v61, s44, 62 +; SI-NEXT: v_writelane_b32 v61, s45, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v62, s25, 8 +; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s92, v31 +; SI-NEXT: v_writelane_b32 v62, s92, 10 +; SI-NEXT: v_writelane_b32 v62, s75, 11 +; SI-NEXT: v_writelane_b32 v62, s26, 12 +; SI-NEXT: v_writelane_b32 v62, s30, 13 +; SI-NEXT: v_writelane_b32 v62, s23, 14 +; SI-NEXT: v_writelane_b32 v62, s52, 15 +; SI-NEXT: v_writelane_b32 v62, s64, 16 +; SI-NEXT: v_writelane_b32 v62, s17, 17 +; SI-NEXT: v_writelane_b32 v62, s65, 18 +; SI-NEXT: v_writelane_b32 v62, s70, 19 +; SI-NEXT: v_writelane_b32 v62, s71, 20 +; SI-NEXT: v_writelane_b32 v62, s49, 21 +; SI-NEXT: v_writelane_b32 v62, s83, 22 +; SI-NEXT: v_writelane_b32 v62, s80, 23 +; SI-NEXT: v_writelane_b32 v62, s82, 24 +; SI-NEXT: v_writelane_b32 v62, s84, 25 +; SI-NEXT: v_writelane_b32 v62, s87, 26 +; SI-NEXT: v_writelane_b32 v62, s86, 27 +; SI-NEXT: v_writelane_b32 v62, s51, 28 +; SI-NEXT: v_writelane_b32 v62, s96, 29 +; SI-NEXT: v_writelane_b32 v62, s34, 30 +; SI-NEXT: v_writelane_b32 v62, s94, 31 +; SI-NEXT: v_writelane_b32 v62, s53, 32 +; SI-NEXT: v_writelane_b32 v62, s66, 33 +; SI-NEXT: v_writelane_b32 v62, s68, 34 +; SI-NEXT: v_writelane_b32 v62, s69, 35 +; SI-NEXT: v_writelane_b32 v62, s77, 36 +; SI-NEXT: v_writelane_b32 v62, s78, 37 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s5, v61, 2 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 1 +; SI-NEXT: v_readlane_b32 s5, v61, 0 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s99, 0xff +; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_and_b32 s4, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_and_b32 s4, s97, 0xff +; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s53, v61, 32 +; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s34, v61, 30 +; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s51, v61, 28 +; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s87, v61, 26 +; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s82, v61, 24 +; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s83, v61, 22 +; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s71, v61, 20 +; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s65, v61, 18 +; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_and_b32 s4, s65, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s67, v61, 16 +; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s64, v61, 14 +; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_and_b32 s4, s64, 0xff +; SI-NEXT: s_lshl_b32 s5, s52, 8 +; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s35, v61, 12 +; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s30, v61, 10 +; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s30, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 +; SI-NEXT: s_mov_b32 s26, s37 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s37, v61, 8 +; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s92, v61, 6 +; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s78, v61, 4 +; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b32 s20, s88 +; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: s_mov_b32 s59, s58 +; SI-NEXT: s_mov_b32 s56, s47 +; SI-NEXT: s_mov_b32 s46, s41 +; SI-NEXT: s_mov_b32 s12, s11 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s97 +; SI-NEXT: s_mov_b32 s97, s81 +; SI-NEXT: s_mov_b32 s81, s85 +; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s40, s72 +; SI-NEXT: s_mov_b32 s45, s73 +; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s78, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s92, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 vcc_lo, s77, 8 +; SI-NEXT: s_or_b32 s5, vcc_lo, s5 +; SI-NEXT: s_add_i32 vcc_lo, s37, 3 +; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 +; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo +; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s60, s39, 8 +; SI-NEXT: s_or_b32 s60, s60, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s61, s48, 8 +; SI-NEXT: s_or_b32 s61, s61, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s64, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s62, s52, 8 +; SI-NEXT: s_or_b32 s62, s62, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s67, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s63, s50, 8 +; SI-NEXT: s_or_b32 s10, s63, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s65, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s72, s54, 8 +; SI-NEXT: s_or_b32 s72, s72, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s71, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s73, s70, 8 +; SI-NEXT: s_or_b32 s73, s73, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s83, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s74, s49, 8 +; SI-NEXT: s_or_b32 s74, s74, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s82, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s75, s80, 8 +; SI-NEXT: s_or_b32 s75, s75, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s87, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s76, s84, 8 +; SI-NEXT: s_or_b32 s76, s76, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s51, 3 +; SI-NEXT: s_add_i32 s93, s53, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s77, s86, 8 +; SI-NEXT: s_add_i32 s89, s34, 3 +; SI-NEXT: s_and_b32 s93, s93, 0xff +; SI-NEXT: s_lshl_b32 s78, s94, 8 +; SI-NEXT: s_add_i32 s34, s66, 3 +; SI-NEXT: s_or_b32 s77, s77, vcc_hi +; SI-NEXT: s_and_b32 s89, s89, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 +; SI-NEXT: s_or_b32 s22, s78, s93 +; SI-NEXT: s_and_b32 s93, s34, 0xff +; SI-NEXT: s_lshl_b32 s92, s16, 8 +; SI-NEXT: s_add_i32 s53, s68, 3 +; SI-NEXT: s_or_b32 s89, vcc_hi, s89 +; SI-NEXT: s_or_b32 s92, s92, s93 +; SI-NEXT: s_and_b32 s93, s53, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8 +; SI-NEXT: s_add_i32 s66, s69, 3 +; SI-NEXT: s_or_b32 s93, vcc_hi, s93 +; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff +; SI-NEXT: s_lshl_b32 s34, s45, 8 +; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi +; SI-NEXT: s_and_b32 s34, s68, 0xff +; SI-NEXT: s_lshl_b32 s39, s40, 8 +; SI-NEXT: s_add_i32 s69, s81, 3 +; SI-NEXT: s_or_b32 s34, s39, s34 +; SI-NEXT: s_and_b32 s39, s69, 0xff +; SI-NEXT: s_lshl_b32 s52, s21, 8 +; SI-NEXT: s_add_i32 s81, s7, 3 +; SI-NEXT: s_or_b32 s39, s52, s39 +; SI-NEXT: s_and_b32 s52, s81, 0xff +; SI-NEXT: s_lshl_b32 s53, s97, 8 +; SI-NEXT: s_add_i32 s85, s12, 3 +; SI-NEXT: s_or_b32 s52, s53, s52 +; SI-NEXT: s_and_b32 s53, s85, 0xff +; SI-NEXT: s_lshl_b32 s64, s11, 8 +; SI-NEXT: s_add_i32 s97, s56, 3 +; SI-NEXT: s_or_b32 s53, s64, s53 +; SI-NEXT: s_and_b32 s64, s97, 0xff +; SI-NEXT: s_lshl_b32 s66, s46, 8 +; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_or_b32 s64, s66, s64 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s66, s59, 8 +; SI-NEXT: s_add_i32 s25, s8, 3 +; SI-NEXT: s_or_b32 s66, s66, s21 +; SI-NEXT: s_and_b32 s21, s25, 0xff +; SI-NEXT: s_lshl_b32 s6, s28, 8 +; SI-NEXT: s_add_i32 s29, s19, 3 +; SI-NEXT: s_or_b32 s67, s6, s21 +; SI-NEXT: s_and_b32 s6, s29, 0xff +; SI-NEXT: s_lshl_b32 s18, s26, 8 +; SI-NEXT: s_add_i32 s28, s17, 3 +; SI-NEXT: s_or_b32 s68, s18, s6 +; SI-NEXT: s_and_b32 s6, s28, 0xff +; SI-NEXT: s_lshl_b32 s18, s23, 8 +; SI-NEXT: s_or_b32 s69, s18, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: s_add_i32 s7, s6, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 15 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v62, 16 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 13 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_lshl_b32 s23, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 14 +; SI-NEXT: s_mov_b32 s91, s24 +; SI-NEXT: s_or_b32 s70, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 20 +; SI-NEXT: s_add_i32 s24, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 11 +; SI-NEXT: s_add_i32 s11, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 18 +; SI-NEXT: s_lshl_b32 s19, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 12 +; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 9 +; SI-NEXT: s_or_b32 s71, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s17, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 10 +; SI-NEXT: s_add_i32 s12, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s17, s17, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s20, v62, 8 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: s_or_b32 s81, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_or_b32 s16, s20, s16 +; SI-NEXT: v_readlane_b32 s20, v62, 7 +; SI-NEXT: s_add_i32 s14, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: s_or_b32 s19, s19, s24 +; SI-NEXT: s_add_i32 s98, s20, 3 +; SI-NEXT: v_readlane_b32 s24, v62, 6 +; SI-NEXT: s_and_b32 s6, s14, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s20, s98, 0xff +; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: s_or_b32 s83, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: s_and_b32 s27, s27, 0xff +; SI-NEXT: s_or_b32 s20, s24, s20 +; SI-NEXT: v_readlane_b32 s24, v62, 5 +; SI-NEXT: s_add_i32 s41, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_add_i32 s86, s24, 3 +; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: s_and_b32 s6, s41, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s24, s86, 0xff +; SI-NEXT: s_lshl_b32 s27, s27, 8 +; SI-NEXT: s_or_b32 s85, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: s_or_b32 s24, s27, s24 +; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: s_add_i32 s46, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 25 +; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: s_add_i32 s82, s27, 3 +; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: s_and_b32 s6, s46, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s27, s82, 0xff +; SI-NEXT: s_lshl_b32 s73, s73, 8 +; SI-NEXT: s_or_b32 s96, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 31 +; SI-NEXT: s_or_b32 s27, s73, s27 +; SI-NEXT: v_readlane_b32 s73, v62, 1 +; SI-NEXT: s_add_i32 s47, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 27 +; SI-NEXT: s_add_i32 s13, s74, 0x300 +; SI-NEXT: s_add_i32 s65, s73, 3 +; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: s_and_b32 s6, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s73, s65, 0xff +; SI-NEXT: s_lshl_b32 s74, s74, 8 +; SI-NEXT: s_or_b32 s97, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 34 +; SI-NEXT: s_or_b32 s73, s74, s73 +; SI-NEXT: v_readlane_b32 s74, v61, 63 +; SI-NEXT: s_add_i32 s56, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 29 +; SI-NEXT: s_add_i32 s14, s75, 0x300 +; SI-NEXT: s_add_i32 s54, s74, 3 +; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: s_and_b32 s6, s56, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s74, s54, 0xff +; SI-NEXT: s_lshl_b32 s75, s75, 8 +; SI-NEXT: s_or_b32 s63, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 36 +; SI-NEXT: s_or_b32 s74, s75, s74 +; SI-NEXT: v_readlane_b32 s75, v61, 61 +; SI-NEXT: s_add_i32 s58, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 30 +; SI-NEXT: s_add_i32 s15, s76, 0x300 +; SI-NEXT: s_add_i32 s50, s75, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: s_and_b32 s6, s58, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s75, s50, 0xff +; SI-NEXT: s_lshl_b32 s76, s76, 8 +; SI-NEXT: s_or_b32 s79, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 37 +; SI-NEXT: s_or_b32 s75, s76, s75 +; SI-NEXT: v_readlane_b32 s76, v61, 59 +; SI-NEXT: s_add_i32 s59, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 33 +; SI-NEXT: s_add_i32 s18, s77, 0x300 +; SI-NEXT: s_add_i32 s48, s76, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: s_and_b32 s6, s59, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s76, s48, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 8 +; SI-NEXT: s_or_b32 s78, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 35 +; SI-NEXT: s_or_b32 s76, s77, s76 +; SI-NEXT: v_readlane_b32 s77, v61, 57 +; SI-NEXT: s_add_i32 s57, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: s_add_i32 s11, s72, 0x300 +; SI-NEXT: s_add_i32 s72, s79, 0x300 +; SI-NEXT: s_add_i32 s37, s77, 3 +; SI-NEXT: v_readlane_b32 s79, v61, 56 +; SI-NEXT: s_and_b32 s6, s57, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s77, s37, 0xff +; SI-NEXT: s_lshl_b32 s79, s79, 8 +; SI-NEXT: s_or_b32 s88, s7, s6 +; SI-NEXT: s_or_b32 s77, s79, s77 +; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: s_add_i32 s21, s89, 0x300 +; SI-NEXT: s_add_i32 s89, s88, 0x300 +; SI-NEXT: s_add_i32 s35, s79, 3 +; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: s_and_b32 s79, s35, 0xff +; SI-NEXT: s_lshl_b32 s88, s88, 8 +; SI-NEXT: s_or_b32 s79, s88, s79 +; SI-NEXT: v_readlane_b32 s88, v61, 53 +; SI-NEXT: s_add_i32 s25, s92, 0x300 +; SI-NEXT: s_add_i32 s30, s88, 3 +; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: s_and_b32 s88, s30, 0xff +; SI-NEXT: s_lshl_b32 s92, s92, 8 +; SI-NEXT: s_or_b32 s88, s92, s88 +; SI-NEXT: v_readlane_b32 s92, v61, 51 +; SI-NEXT: s_add_i32 s94, s92, 3 +; SI-NEXT: s_and_b32 s92, s94, 0xff +; SI-NEXT: s_lshl_b32 s91, s91, 8 +; SI-NEXT: s_add_i32 s90, s90, 3 +; SI-NEXT: s_or_b32 s91, s91, s92 +; SI-NEXT: s_and_b32 s90, s90, 0xff +; SI-NEXT: s_lshl_b32 s92, s31, 8 +; SI-NEXT: s_or_b32 s90, s92, s90 +; SI-NEXT: v_readlane_b32 s92, v61, 50 +; SI-NEXT: s_add_i32 s92, s92, 3 +; SI-NEXT: s_add_i32 s26, s93, 0x300 +; SI-NEXT: s_and_b32 s92, s92, 0xff +; SI-NEXT: s_lshl_b32 s93, s95, 8 +; SI-NEXT: s_or_b32 s92, s93, s92 +; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_and_b32 s93, s93, 0xff +; SI-NEXT: s_lshl_b32 s94, s55, 8 +; SI-NEXT: s_or_b32 s93, s94, s93 +; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: s_and_b32 s94, s94, 0xff +; SI-NEXT: s_lshl_b32 s95, s99, 8 +; SI-NEXT: s_or_b32 s94, s95, s94 +; SI-NEXT: v_readlane_b32 s95, v61, 1 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: v_readlane_b32 s30, v61, 0 +; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 +; SI-NEXT: s_and_b32 s95, s95, 0xff +; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: s_or_b32 s95, vcc_lo, s95 +; SI-NEXT: s_add_i32 vcc_lo, s30, 3 +; SI-NEXT: v_readlane_b32 s30, v61, 2 +; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 +; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo +; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s30, s30, 8 +; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi +; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: s_add_i32 s30, s30, 3 +; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: s_and_b32 s30, s30, 0xff +; SI-NEXT: s_lshl_b32 s31, s31, 8 +; SI-NEXT: s_or_b32 s30, s31, s30 +; SI-NEXT: v_readlane_b32 s31, v61, 42 +; SI-NEXT: s_add_i32 s29, s34, 0x300 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: s_and_b32 s31, s31, 0xff +; SI-NEXT: s_lshl_b32 s34, s34, 8 +; SI-NEXT: s_or_b32 s31, s34, s31 +; SI-NEXT: s_addk_i32 s31, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 +; SI-NEXT: s_addk_i32 s30, 0x300 +; SI-NEXT: s_addk_i32 vcc_hi, 0x300 +; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 +; SI-NEXT: s_add_i32 s34, s34, 3 +; SI-NEXT: v_readlane_b32 s35, v61, 39 +; SI-NEXT: s_and_b32 s34, s34, 0xff +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: s_lshl_b32 s35, s35, 8 +; SI-NEXT: s_addk_i32 vcc_lo, 0x300 +; SI-NEXT: s_or_b32 s34, s35, s34 +; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: v_readlane_b32 s36, v61, 37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo +; SI-NEXT: s_and_b32 s35, s35, 0xff +; SI-NEXT: s_lshl_b32 s36, s36, 8 +; SI-NEXT: s_or_b32 s35, s36, s35 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_add_i32 s7, s60, 0x300 +; SI-NEXT: s_add_i32 s8, s61, 0x300 +; SI-NEXT: s_add_i32 s9, s62, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_add_i32 s40, s39, 0x300 +; SI-NEXT: s_add_i32 s41, s52, 0x300 +; SI-NEXT: s_add_i32 s42, s53, 0x300 +; SI-NEXT: s_add_i32 s43, s64, 0x300 +; SI-NEXT: s_add_i32 s44, s66, 0x300 +; SI-NEXT: s_add_i32 s45, s67, 0x300 +; SI-NEXT: s_add_i32 s46, s68, 0x300 +; SI-NEXT: s_add_i32 s47, s69, 0x300 +; SI-NEXT: s_add_i32 s56, s70, 0x300 +; SI-NEXT: s_add_i32 s57, s71, 0x300 +; SI-NEXT: s_add_i32 s58, s81, 0x300 +; SI-NEXT: s_add_i32 s59, s83, 0x300 +; SI-NEXT: s_add_i32 s60, s85, 0x300 +; SI-NEXT: s_add_i32 s61, s96, 0x300 +; SI-NEXT: s_add_i32 s62, s97, 0x300 +; SI-NEXT: s_addk_i32 s63, 0x300 +; SI-NEXT: s_addk_i32 s78, 0x300 +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_addk_i32 s27, 0x300 +; SI-NEXT: s_addk_i32 s73, 0x300 +; SI-NEXT: s_addk_i32 s74, 0x300 +; SI-NEXT: s_addk_i32 s75, 0x300 +; SI-NEXT: s_addk_i32 s76, 0x300 +; SI-NEXT: s_addk_i32 s77, 0x300 +; SI-NEXT: s_addk_i32 s79, 0x300 +; SI-NEXT: s_addk_i32 s88, 0x300 +; SI-NEXT: s_addk_i32 s91, 0x300 +; SI-NEXT: s_addk_i32 s90, 0x300 +; SI-NEXT: s_addk_i32 s92, 0x300 +; SI-NEXT: s_addk_i32 s93, 0x300 +; SI-NEXT: s_addk_i32 s94, 0x300 +; SI-NEXT: s_addk_i32 s95, 0x300 +; SI-NEXT: s_addk_i32 s34, 0x300 +; SI-NEXT: s_addk_i32 s35, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s35 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_mov_b32 s26, s37 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_mov_b32 s59, s58 +; SI-NEXT: s_mov_b32 s56, s47 +; SI-NEXT: s_mov_b32 s46, s41 +; SI-NEXT: s_mov_b32 s12, s11 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s97 +; SI-NEXT: s_mov_b32 s97, s81 +; SI-NEXT: s_mov_b32 s81, s85 +; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s40, s72 +; SI-NEXT: s_mov_b32 s45, s73 +; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: s_mov_b32 s20, s88 +; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s78, v61, 4 +; SI-NEXT: v_readlane_b32 s92, v61, 6 +; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s37, v61, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 10 +; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s35, v61, 12 +; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s64, v61, 14 +; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_readlane_b32 s67, v61, 16 +; SI-NEXT: v_readlane_b32 s65, v61, 18 +; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_readlane_b32 s71, v61, 20 +; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_readlane_b32 s83, v61, 22 +; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_readlane_b32 s82, v61, 24 +; SI-NEXT: v_readlane_b32 s87, v61, 26 +; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s51, v61, 28 +; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: v_readlane_b32 s34, v61, 30 +; SI-NEXT: v_readlane_b32 s53, v61, 32 +; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v46, v16 +; VI-NEXT: v_mov_b32_e32 v60, v5 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_mov_b32_e32 v62, v21 +; VI-NEXT: v_mov_b32_e32 v47, v17 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v18 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:88 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:204 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v22 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:284 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:280 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:312 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB93_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v60 +; VI-NEXT: v_mov_b32_e32 v28, v26 +; VI-NEXT: v_mov_b32_e32 v26, v23 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v22 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v59, v10 +; VI-NEXT: v_mov_b32_e32 v58, v43 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v27, v14 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, v63 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v57 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v60 +; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v56, v45 +; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v53 +; VI-NEXT: v_mov_b32_e32 v55, v63 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v46 +; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v41, v52 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v54 +; VI-NEXT: v_mov_b32_e32 v54, v49 +; VI-NEXT: v_mov_b32_e32 v49, v53 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB93_3 +; VI-NEXT: .LBB93_2: +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v56, v45 +; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: v_mov_b32_e32 v35, v63 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v53 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v52, v38 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB93_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v38, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB93_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v47 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v53, v34 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_and_b32 s10, s24, 0xff +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_and_b32 s12, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s11, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s12 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 +; VI-NEXT: s_and_b32 s10, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v39, 0xff, v39 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s10, 16 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v31, 24, v63 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x300, v28 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x300, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v23 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 24, v59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v43, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v26 +; VI-NEXT: v_and_b32_e32 v26, 0xff, v43 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_or_b32_sdwa v26, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v44, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v27, 0xff, v44 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x300, v5 +; VI-NEXT: v_lshlrev_b32_e32 v32, 24, v49 +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; VI-NEXT: v_or_b32_e32 v4, v4, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v29, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v5, v29, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v6, v29, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v8, v29, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v9, v29, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v10, v29, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v35, 24, v56 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v37 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v14, v29, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v62 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v55 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v36 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v29, v29, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v53 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v19, v29, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v57 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v20, v29, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v38 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v22 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v55, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v54 +; VI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v40, 24, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v41, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; VI-NEXT: v_and_b32_e32 v41, 0xff, v41 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; VI-NEXT: v_or_b32_e32 v55, v40, v55 +; VI-NEXT: v_or_b32_sdwa v23, v55, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v42, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v42 +; VI-NEXT: v_or_b32_e32 v54, v54, v55 +; VI-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v53, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_or_b32_e32 v35, v35, v53 +; VI-NEXT: v_or_b32_sdwa v25, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v52, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v51, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; VI-NEXT: v_or_b32_e32 v29, v29, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_e32 v31, v31, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 +; VI-NEXT: .LBB93_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v39, v16 +; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v55, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v50, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v49, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v45 +; GFX9-NEXT: v_mov_b32_e32 v45, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB93_3 +; GFX9-NEXT: .LBB93_2: +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB93_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB93_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_lshl_b32 s6, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshl_b32 s9, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshl_b32 s10, s19, 8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: .LBB93_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v2, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v68 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v165 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB93_3 +; GFX11-TRUE16-NEXT: .LBB93_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v167, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v176, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v165 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v165, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v70, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v69, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v64, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v53, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v38, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v37, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v34, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v53, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v49, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 +; GFX11-TRUE16-NEXT: .LBB93_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB93_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB93_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB93_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB93_3 +; GFX11-FAKE16-NEXT: .LBB93_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 -; GFX11-FAKE16-NEXT: .LBB46_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 +; GFX11-FAKE16-NEXT: .LBB93_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB93_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB93_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -93250,2056 +191257,1992 @@ end: } define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v20 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v23 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v25 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v27 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v29 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v44 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v37 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v36 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v62 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v53 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v61 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v59 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v56 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v43 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v41 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v40 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v54 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v52 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v50 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v38 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v35 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v46 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v47 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v14 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; GCN-NEXT: v_or_b32_e32 v61, v32, v14 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v35 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v55, v5, v6 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; GCN-NEXT: v_or_b32_e32 v43, v7, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; GCN-NEXT: v_or_b32_e32 v41, v9, v5 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_or_b32_e32 v47, v10, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v2, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_or_b32_e32 v58, v11, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v54, v4, v1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_or_b32_e32 v60, v12, v1 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_or_b32_e32 v44, v13, v2 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v5, v1 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v6, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v7, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v9, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v10, v1 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v56, v11, v5 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v49 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v12, v3 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v50 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v13, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v33, v7 -; GCN-NEXT: v_bfe_u32 v7, v35, 8, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v7, v4 -; GCN-NEXT: v_bfe_u32 v4, v59, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v4, v9 -; GCN-NEXT: v_bfe_u32 v4, v23, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v4, v1 -; GCN-NEXT: v_bfe_u32 v1, v32, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v1, v10 -; GCN-NEXT: v_bfe_u32 v1, v31, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v1, v5 -; GCN-NEXT: v_bfe_u32 v1, v22, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v1, v11 -; GCN-NEXT: v_bfe_u32 v1, v16, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v1, v3 -; GCN-NEXT: v_bfe_u32 v1, v15, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v15, v8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v21, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v1, v6 -; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v16, v38, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v18, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v29, v2 -; GCN-NEXT: v_bfe_u32 v2, v14, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v2, v24, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v2, v30, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v2, v19, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v43, v55, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v43, v55, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v43, v55, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v62 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v47 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v58 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v60 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v14, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v17, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: .LBB47_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v38 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v29 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v19 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v49 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v16 -; GCN-NEXT: v_or_b32_e32 v8, v8, v19 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v48 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v18 -; GCN-NEXT: v_or_b32_e32 v8, v8, v21 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v39 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v63, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v33 -; GCN-NEXT: v_mov_b32_e32 v33, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GCN-NEXT: v_or_b32_e32 v59, v23, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GCN-NEXT: v_or_b32_e32 v23, v19, v5 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v42, v9, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; GCN-NEXT: v_or_b32_e32 v35, v35, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_or_b32_e32 v57, v28, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; GCN-NEXT: v_or_b32_e32 v28, v27, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v36 -; GCN-NEXT: v_or_b32_e32 v56, v13, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_or_b32_e32 v27, v22, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; GCN-NEXT: v_or_b32_e32 v46, v24, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v45, v34, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v25, v25, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v62 -; GCN-NEXT: v_or_b32_e32 v44, v12, v14 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_bfe_u32 v5, v62, 8, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v51, v40, v38 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v5, v18, 8, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v54, v54, v13 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v5, v60, 8, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v60, v52, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v1, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v40, v50, v22 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v58, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v58, v43, v30 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v41, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v41, v61, v24 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v55, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v47, v47, v32 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v49, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v55, v4, v31 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v48, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v43, v3, v36 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v11, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v61, v2, v34 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v21, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v62, v15, v37 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v53, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v33, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v19, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v43, v55, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v43, v55, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v43, v55, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 8 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 8 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v17, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: .LBB47_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v62 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; GCN-NEXT: v_or_b32_e32 v29, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v30, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_mov_b32_e32 v5, v19 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v31, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v49, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v4, v1, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v5, v1, v33 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v29, v37 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v30, v38 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v48 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v22, v22, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v43 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v44 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v45 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v47 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v56 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v58 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v59 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_mov_b32_e32 v45, v46 +; SI-NEXT: v_mov_b32_e32 v46, v6 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v41, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v53, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v51, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v35, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v34, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v47, v14 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v25, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 +; SI-NEXT: v_or_b32_e32 v22, v58, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: v_or_b32_e32 v23, v57, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v20, v61, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v21, v60, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v18, v40, v5 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_or_b32_e32 v19, v55, v5 +; SI-NEXT: v_alignbit_b32 v5, v50, v49, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_or_b32_e32 v16, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v42, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v15, v2, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB94_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v20, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v22, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_or_b32_e32 v24, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v26, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v28, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v27, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_or_b32_e32 v30, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v35, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v37, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v36, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v38, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v48, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v49, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v51, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v50, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_or_b32_e32 v54, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v44, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v41, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: .LBB94_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v128i8: ; VI: ; %bb.0: @@ -95500,7 +193443,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill @@ -95644,9 +193587,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v31 ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20] ; VI-NEXT: v_mov_b32_e32 v51, v34 -; VI-NEXT: .LBB47_2: ; %Flow +; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_4 +; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -95984,7 +193927,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v32, v41, 8, 8 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: .LBB47_4: ; %end +; VI-NEXT: .LBB94_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -96576,7 +194519,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill @@ -96759,9 +194702,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: .LBB47_2: ; %Flow +; GFX9-NEXT: .LBB94_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_4 +; GFX9-NEXT: s_cbranch_execz .LBB94_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] @@ -96977,7 +194920,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 -; GFX9-NEXT: .LBB47_4: ; %end +; GFX9-NEXT: .LBB94_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -97417,7 +195360,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -97484,9 +195427,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] @@ -97586,7 +195529,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GFX11-TRUE16-NEXT: .LBB47_4: ; %end +; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l @@ -98034,7 +195977,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] @@ -98133,462 +196076,7092 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: .LBB47_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB94_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: .LBB94_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v52 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB95_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v44 +; SI-NEXT: v_or_b32_e32 v55, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v57, v16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_or_b32_e32 v17, v14, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v16, v19, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_or_b32_e32 v19, v23, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v47, v60, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v43, v42, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v63, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v42, v58, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v60, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_or_b32_e32 v22, v2, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v2, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_or_b32_e32 v34, v34, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_or_b32_e32 v3, v59, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_or_b32_e32 v59, v56, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v62, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_or_b32_e32 v62, v25, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v2, v27, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_or_b32_e32 v25, v28, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v36, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_or_b32_e32 v23, v35, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v39, v11 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_alignbit_b32 v1, v55, v13, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v13, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v19, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v14, v43, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v14, v43, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v14, v43, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v24, v22, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v24, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v24, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v4, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v4, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v3, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v3, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v61, v50, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v48, v11 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v49, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v36, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v36, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v18, 16 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_or_b32_e32 v58, v54, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_or_b32_e32 v6, v53, v11 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v61, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v58, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v58, v2, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61 +; SI-NEXT: v_or_b32_e32 v54, v40, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v38, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v48, 8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v37, 8, 8 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v52, 8, 8 +; SI-NEXT: v_alignbit_b32 v28, v58, v2, 24 +; SI-NEXT: v_alignbit_b32 v2, v54, v6, 24 +; SI-NEXT: v_alignbit_b32 v39, v54, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v54, v6, 8 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v56, v12, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 8 +; SI-NEXT: v_mov_b32_e32 v20, v29 +; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_bfe_u32 v29, v7, 8, 8 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB95_3 +; SI-NEXT: .LBB95_2: +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v20, v29 +; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: .LBB95_3: ; %Flow +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_mov_b32_e32 v31, v33 +; SI-NEXT: v_mov_b32_e32 v44, v15 +; SI-NEXT: v_mov_b32_e32 v33, v20 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v11, v27 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v30, v29 +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: s_cbranch_vccnz .LBB95_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_or_b32_e32 v56, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v36, v14, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v54, v14, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v52, v17, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v58, v17, v19 +; SI-NEXT: v_alignbit_b32 v40, v58, v52, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v11, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v21, v22 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v16, v23, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v48, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v53, v26, v27 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v62, v28, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v59, v29, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v3, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v4, v34, v30 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_or_b32_e32 v34, v35, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v22, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v24, v37, v36 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 +; SI-NEXT: v_or_b32_e32 v42, v39, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: v_mov_b32_e32 v36, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v60, v37, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_alignbit_b32 v39, v54, v29, 16 +; SI-NEXT: v_or_b32_e32 v43, v48, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v14, v49, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v14, v43, 8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v19, v48, v37 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_or_b32_e32 v47, v49, v37 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v50, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v16, v37, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_alignbit_b32 v50, v54, v29, 24 +; SI-NEXT: v_or_b32_e32 v57, v48, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v17, v49, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_mov_b32_e32 v49, v53 +; SI-NEXT: v_alignbit_b32 v53, v54, v29, 8 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v13, v48, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_or_b32_e32 v55, v51, v37 +; SI-NEXT: v_alignbit_b32 v10, v55, v13, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v55, v13, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v17, v57, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v17, v57, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v17, v57, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v16, v21, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v16, v21, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v16, v21, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v47, v19, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v47, v19, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v47, v19, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v14, v43, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v14, v43, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v60, v42, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v60, v42, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v60, v42, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v24, v22, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v24, v22, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v24, v22, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v34, v4, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v34, v4, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v34, v4, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v59, v3, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v59, v3, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v59, v3, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v62, v49, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v62, v49, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v62, v49, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v25, v36, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v25, v36, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v25, v36, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v23, v35, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v23, v35, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v23, v35, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v61, v11, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v61, v11, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v10, v61, v11, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v58, v52, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v10, v58, v52, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v56 +; SI-NEXT: v_alignbit_b32 v11, v12, v10, 24 +; SI-NEXT: v_alignbit_b32 v56, v12, v10, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v38, v12, v10, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v55 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v17 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v47 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v14 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v60 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v59 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v62 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v23 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v58 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v54 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v20, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v18, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v15, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v33, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v44, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v31, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v30, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v9, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v8, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v6, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v5, 8, 8 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v10, v26, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v2, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v1, 8, 8 +; SI-NEXT: v_alignbit_b32 v48, v55, v13, 24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_bfe_u32 v30, v7, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v27, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: .LBB95_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v37, v51 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v51, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_or_b32_e32 v37, v37, v51 +; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v37, v51 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; SI-NEXT: v_or_b32_e32 v20, v48, v20 +; SI-NEXT: v_or_b32_e32 v20, v37, v20 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; SI-NEXT: v_or_b32_e32 v37, v48, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 +; SI-NEXT: v_or_b32_e32 v20, v37, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v3 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v3 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s44, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s42, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s40, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s14, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s12, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s10, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s6, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: s_lshr_b32 s52, s7, 16 +; VI-NEXT: s_lshr_b32 s53, s6, 16 +; VI-NEXT: s_lshr_b32 s84, s9, 16 +; VI-NEXT: s_lshr_b32 s85, s8, 16 +; VI-NEXT: s_lshr_b32 s80, s11, 24 +; VI-NEXT: s_lshr_b32 s86, s11, 16 +; VI-NEXT: s_lshr_b32 s87, s10, 16 +; VI-NEXT: s_lshr_b32 s81, s13, 24 +; VI-NEXT: s_lshr_b32 s54, s13, 16 +; VI-NEXT: s_lshr_b32 s55, s12, 16 +; VI-NEXT: s_lshr_b32 s82, s15, 24 +; VI-NEXT: s_lshr_b32 s64, s15, 16 +; VI-NEXT: s_lshr_b32 s65, s14, 16 +; VI-NEXT: s_lshr_b32 s83, s41, 24 +; VI-NEXT: s_lshr_b32 s66, s41, 16 +; VI-NEXT: s_lshr_b32 s67, s40, 16 +; VI-NEXT: s_lshr_b32 s50, s43, 24 +; VI-NEXT: s_lshr_b32 s68, s43, 16 +; VI-NEXT: s_lshr_b32 s69, s42, 16 +; VI-NEXT: s_lshr_b32 s51, s45, 24 +; VI-NEXT: s_lshr_b32 s70, s45, 16 +; VI-NEXT: s_lshr_b32 s71, s44, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_mov_b32_e32 v7, 0x200 +; VI-NEXT: v_add_f16_e32 v1, s46, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s45, v7 +; VI-NEXT: s_lshr_b32 s45, s44, 16 +; VI-NEXT: v_or_b32_e32 v23, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s45, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s44, v7 +; VI-NEXT: s_lshr_b32 s44, s43, 16 +; VI-NEXT: v_or_b32_e32 v22, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s44, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s43, v7 +; VI-NEXT: s_lshr_b32 s43, s42, 16 +; VI-NEXT: v_or_b32_e32 v25, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s43, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s42, v7 +; VI-NEXT: s_lshr_b32 s42, s41, 16 +; VI-NEXT: v_or_b32_e32 v24, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s42, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s41, v7 +; VI-NEXT: s_lshr_b32 s41, s40, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v27, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s41, v7 +; VI-NEXT: v_add_f16_e32 v2, s40, v7 +; VI-NEXT: s_lshr_b32 s40, s15, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v53, s40, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v26, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; VI-NEXT: v_add_f16_e32 v2, s15, v7 +; VI-NEXT: s_lshr_b32 s15, s14, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v29, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s15, v7 +; VI-NEXT: v_add_f16_e32 v2, s14, v7 +; VI-NEXT: s_lshr_b32 s14, s13, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v43, s14, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v28, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; VI-NEXT: v_add_f16_e32 v2, s13, v7 +; VI-NEXT: s_lshr_b32 s13, s12, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s13, v7 +; VI-NEXT: v_add_f16_e32 v2, s12, v7 +; VI-NEXT: s_lshr_b32 s12, s11, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v37, s12, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_add_f16_e32 v2, s11, v7 +; VI-NEXT: s_lshr_b32 s11, s10, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v31, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s11, v7 +; VI-NEXT: v_add_f16_e32 v2, s10, v7 +; VI-NEXT: s_lshr_b32 s10, s9, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v52, s10, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v30, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_add_f16_e32 v2, s9, v7 +; VI-NEXT: s_lshr_b32 s9, s8, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s9, v7 +; VI-NEXT: v_add_f16_e32 v2, s8, v7 +; VI-NEXT: s_lshr_b32 s8, s7, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v50, s8, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; VI-NEXT: v_add_f16_e32 v2, s7, v7 +; VI-NEXT: s_lshr_b32 s7, s6, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s7, v7 +; VI-NEXT: v_add_f16_e32 v8, s6, v7 +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v36, s6, v7 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; VI-NEXT: v_add_f16_e32 v9, s17, v7 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_or_b32_e32 v33, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s6, v7 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s16, v7 +; VI-NEXT: v_add_f16_e32 v38, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v32, v9, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; VI-NEXT: v_add_f16_e32 v9, s19, v7 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_or_b32_e32 v21, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s6, v7 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s18, v7 +; VI-NEXT: v_add_f16_e32 v61, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v20, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s20, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61 +; VI-NEXT: v_add_f16_e32 v9, s21, v7 +; VI-NEXT: v_or_b32_e32 v35, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s20, v7 +; VI-NEXT: v_add_f16_e32 v45, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v34, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s22, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; VI-NEXT: v_add_f16_e32 v9, s23, v7 +; VI-NEXT: v_or_b32_e32 v19, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s22, v7 +; VI-NEXT: v_add_f16_e32 v47, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v18, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s24, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; VI-NEXT: v_add_f16_e32 v9, s25, v7 +; VI-NEXT: v_or_b32_e32 v16, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s24, v7 +; VI-NEXT: v_add_f16_e32 v57, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v15, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s26, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 +; VI-NEXT: v_add_f16_e32 v9, s27, v7 +; VI-NEXT: v_or_b32_e32 v13, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s26, v7 +; VI-NEXT: v_add_f16_e32 v59, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v12, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; VI-NEXT: v_add_f16_e32 v9, s29, v7 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: v_or_b32_e32 v10, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: v_add_f16_e32 v51, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s28, v7 +; VI-NEXT: v_add_f16_e32 v54, s5, v7 +; VI-NEXT: v_add_f16_e32 v11, s7, v7 +; VI-NEXT: v_add_f16_e32 v55, s4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v9, v8 +; VI-NEXT: v_or_b32_e32 v8, v54, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v55, v7 +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v8 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v10 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v9 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v13 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23] +; VI-NEXT: v_bfe_u32 v23, v50, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v37, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v43, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v53, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25] +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31] +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22 +; VI-NEXT: v_bfe_u32 v25, v51, 8, 8 +; VI-NEXT: v_bfe_u32 v27, v59, 8, 8 +; VI-NEXT: v_bfe_u32 v6, v57, 8, 8 +; VI-NEXT: v_bfe_u32 v12, v47, 8, 8 +; VI-NEXT: v_bfe_u32 v15, v45, 8, 8 +; VI-NEXT: v_bfe_u32 v1, v61, 8, 8 +; VI-NEXT: v_bfe_u32 v22, v38, 8, 8 +; VI-NEXT: v_bfe_u32 v2, v36, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_bfe_u32 v26, v50, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v24, v24, 8, 8 +; VI-NEXT: s_branch .LBB95_5 +; VI-NEXT: .LBB95_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s45 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s43 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s40 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s41 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s71 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s69 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s68 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s67 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s66 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s65 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s64 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s87 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s85 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s53 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s52 +; VI-NEXT: v_readlane_b32 s6, v62, 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 1 +; VI-NEXT: v_mov_b32_e32 v36, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 3 +; VI-NEXT: v_mov_b32_e32 v38, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 5 +; VI-NEXT: v_mov_b32_e32 v61, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 7 +; VI-NEXT: v_mov_b32_e32 v45, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 9 +; VI-NEXT: v_mov_b32_e32 v47, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 11 +; VI-NEXT: v_mov_b32_e32 v57, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 12 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 13 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: v_mov_b32_e32 v59, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 14 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v22, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: v_mov_b32_e32 v25, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s80 +; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s81 +; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s82 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: v_mov_b32_e32 v42, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: v_mov_b32_e32 v58, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s78 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: v_mov_b32_e32 v32, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: v_mov_b32_e32 v28, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: v_mov_b32_e32 v34, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: v_mov_b32_e32 v3, s88 +; VI-NEXT: v_readlane_b32 s6, v62, 15 +; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s70 +; VI-NEXT: v_mov_b32_e32 v43, s54 +; VI-NEXT: v_mov_b32_e32 v37, s86 +; VI-NEXT: v_mov_b32_e32 v52, s84 +; VI-NEXT: v_mov_b32_e32 v51, s6 +; VI-NEXT: v_mov_b32_e32 v54, s5 +; VI-NEXT: v_mov_b32_e32 v23, s83 +; VI-NEXT: v_mov_b32_e32 v24, s50 +; VI-NEXT: v_mov_b32_e32 v26, s51 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v20, s76 +; VI-NEXT: v_mov_b32_e32 v19, s74 +; VI-NEXT: v_mov_b32_e32 v18, s72 +; VI-NEXT: v_mov_b32_e32 v17, s62 +; VI-NEXT: v_mov_b32_e32 v16, s60 +; VI-NEXT: v_mov_b32_e32 v13, s58 +; VI-NEXT: v_mov_b32_e32 v10, s56 +; VI-NEXT: v_mov_b32_e32 v7, s46 +; VI-NEXT: v_mov_b32_e32 v3, s90 +; VI-NEXT: v_mov_b32_e32 v4, s30 +; VI-NEXT: v_mov_b32_e32 v5, s34 +; VI-NEXT: v_mov_b32_e32 v8, s36 +; VI-NEXT: v_mov_b32_e32 v11, s38 +; VI-NEXT: v_mov_b32_e32 v14, s48 +; VI-NEXT: .LBB95_5: ; %end +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v22 +; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s44, v3 +; GFX9-NEXT: v_readfirstlane_b32 s45, v4 +; GFX9-NEXT: v_readfirstlane_b32 s42, v5 +; GFX9-NEXT: v_readfirstlane_b32 s43, v6 +; GFX9-NEXT: v_readfirstlane_b32 s40, v7 +; GFX9-NEXT: v_readfirstlane_b32 s41, v8 +; GFX9-NEXT: v_readfirstlane_b32 s14, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s26, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s26, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s24, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s24, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s82, s11, 24 +; GFX9-NEXT: s_lshr_b32 s83, s11, 16 +; GFX9-NEXT: s_lshr_b32 s85, s11, 8 +; GFX9-NEXT: s_lshr_b32 s84, s10, 16 +; GFX9-NEXT: s_lshr_b32 s86, s10, 8 +; GFX9-NEXT: s_lshr_b32 s87, s13, 24 +; GFX9-NEXT: s_lshr_b32 s96, s13, 16 +; GFX9-NEXT: s_lshr_b32 s98, s13, 8 +; GFX9-NEXT: s_lshr_b32 s97, s12, 16 +; GFX9-NEXT: s_lshr_b32 s99, s12, 8 +; GFX9-NEXT: s_lshr_b32 s38, s15, 24 +; GFX9-NEXT: s_lshr_b32 s39, s15, 16 +; GFX9-NEXT: s_lshr_b32 s49, s15, 8 +; GFX9-NEXT: s_lshr_b32 s48, s14, 16 +; GFX9-NEXT: s_lshr_b32 s50, s14, 8 +; GFX9-NEXT: s_lshr_b32 s51, s41, 24 +; GFX9-NEXT: s_lshr_b32 s52, s41, 16 +; GFX9-NEXT: s_lshr_b32 s54, s41, 8 +; GFX9-NEXT: s_lshr_b32 s53, s40, 16 +; GFX9-NEXT: s_lshr_b32 s55, s40, 8 +; GFX9-NEXT: s_lshr_b32 s64, s43, 24 +; GFX9-NEXT: s_lshr_b32 s65, s43, 16 +; GFX9-NEXT: s_lshr_b32 s67, s43, 8 +; GFX9-NEXT: s_lshr_b32 s66, s42, 16 +; GFX9-NEXT: s_lshr_b32 s68, s42, 8 +; GFX9-NEXT: s_lshr_b32 s69, s45, 24 +; GFX9-NEXT: s_lshr_b32 s70, s45, 16 +; GFX9-NEXT: s_lshr_b32 s80, s45, 8 +; GFX9-NEXT: s_lshr_b32 s71, s44, 16 +; GFX9-NEXT: s_lshr_b32 s81, s44, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v15, 0x200 +; GFX9-NEXT: v_pk_add_f16 v26, s5, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, s4, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, s45, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, s44, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s43, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s42, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s41, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s40, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s15, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s14, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s13, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s12, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s11, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s10, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s9, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s8, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s7, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s6, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v49, s17, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v48, s16, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v38, s19, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v37, s18, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v36, s21, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v35, s20, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v34, s23, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v33, s22, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, s25, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, s24, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, s27, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, s26, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, s29, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, s28, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v4 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: s_branch .LBB95_5 +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v15, s71 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s80 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s70 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s69 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s68 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s66 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s67 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s65 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s64 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s53 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s52 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s50 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s49 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s39 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s99 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s96 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s87 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s86 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s84 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s85 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s83 +; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s82 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v51, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v50, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: v_mov_b32_e32 v20, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: v_mov_b32_e32 v54, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: v_mov_b32_e32 v23, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: v_mov_b32_e32 v56, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s58 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s60 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s62 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s72 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s74 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s76 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s78 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s88 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s90 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s94 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s36 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: v_mov_b32_e32 v22, s45 +; GFX9-NEXT: v_mov_b32_e32 v13, s42 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v11, s40 +; GFX9-NEXT: v_mov_b32_e32 v12, s41 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v48, s16 +; GFX9-NEXT: v_mov_b32_e32 v49, s17 +; GFX9-NEXT: v_mov_b32_e32 v37, s18 +; GFX9-NEXT: v_mov_b32_e32 v38, s19 +; GFX9-NEXT: v_mov_b32_e32 v35, s20 +; GFX9-NEXT: v_mov_b32_e32 v36, s21 +; GFX9-NEXT: v_mov_b32_e32 v33, s22 +; GFX9-NEXT: v_mov_b32_e32 v34, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s24 +; GFX9-NEXT: v_mov_b32_e32 v32, s25 +; GFX9-NEXT: v_mov_b32_e32 v29, s26 +; GFX9-NEXT: v_mov_b32_e32 v30, s27 +; GFX9-NEXT: v_mov_b32_e32 v27, s28 +; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s5 +; GFX9-NEXT: v_mov_b32_e32 v41, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v51 +; GFX9-NEXT: v_mov_b32_e32 v51, v52 +; GFX9-NEXT: v_mov_b32_e32 v52, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v55 +; GFX9-NEXT: v_mov_b32_e32 v55, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s81 +; GFX9-NEXT: .LBB95_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64f16_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s10, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s12, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s15, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s74, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s75, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v51, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v50, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s40 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v55, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v49, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v48, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 16, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 16, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v21 +; GFX11-TRUE16-NEXT: s_branch .LBB95_5 +; GFX11-TRUE16-NEXT: .LBB95_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB95_2 +; GFX11-TRUE16-NEXT: .LBB95_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v78, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s0 +; GFX11-TRUE16-NEXT: .LBB95_5: ; %end +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v60, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v69, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v81, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v50, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v81, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v35, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v67, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v35, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v36, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, v67, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v66, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v36, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xff, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v52, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v27, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v28, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v21, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v32, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v49, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v51, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v17, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v18, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v27, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v70, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v24, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v26, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v23, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v20 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[35:38], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s5, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s4, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s9, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s8, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s11, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s10, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s74, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s75, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v51, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v50, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s41 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s40 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v53, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v52, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 24, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 24, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-FAKE16-NEXT: .LBB47_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-FAKE16-NEXT: s_branch .LBB95_5 +; GFX11-FAKE16-NEXT: .LBB95_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, -1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB95_2 +; GFX11-FAKE16-NEXT: .LBB95_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v71, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v74, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v49, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v59, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v57, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v47, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v43, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v41, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v48, s62 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v54, s72 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v64, s60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v80, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v182, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v181, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v180, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v179, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v176, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v167, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v166, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v163, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v162, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v161, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v150, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v149, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v148, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v78, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-FAKE16-NEXT: .LBB95_5: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v69, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v60, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v50, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, v60, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v52, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v66, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v57, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v80, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v183 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v70, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v80, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v66, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v67, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v80, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v32, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v33, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v28, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v29, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v20, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v21, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v15, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v144 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v25, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v33, v34 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v13, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v26, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v21, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v20, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v22, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v17 ; GFX11-FAKE16-NEXT: s_clause 0x5 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -98609,2336 +203182,2164 @@ end: } define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v49, v7 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v60, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 24, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 24, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v63, 24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:392 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:376 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; kill: killed $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; kill: killed $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; kill: killed $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; kill: killed $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; kill: killed $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; kill: killed $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; kill: killed $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; kill: killed $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; kill: killed $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v32, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v28, v1, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v1 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v17 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v25, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v25, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v25, v45 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v13, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v13, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v12, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v12, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v56, v48, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v50, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v2, v36, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v53, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v3, v40, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v47, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v26 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v1, v63, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v1, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v1, v38 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v1, v48 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v58, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v1, v54 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v30 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v56 -; GCN-NEXT: v_or_b32_e32 v8, v49, v1 -; GCN-NEXT: v_or_b32_e32 v31, v51, v2 -; GCN-NEXT: v_or_b32_e32 v56, v57, v3 -; GCN-NEXT: v_or_b32_e32 v4, v55, v0 -; GCN-NEXT: v_or_b32_e32 v5, v40, v62 -; GCN-NEXT: v_or_b32_e32 v6, v37, v17 -; GCN-NEXT: v_or_b32_e32 v7, v25, v61 -; GCN-NEXT: v_or_b32_e32 v37, v34, v59 -; GCN-NEXT: v_or_b32_e32 v25, v9, v63 -; GCN-NEXT: v_or_b32_e32 v38, v10, v44 -; GCN-NEXT: v_or_b32_e32 v51, v11, v52 -; GCN-NEXT: v_or_b32_e32 v55, v12, v50 -; GCN-NEXT: v_or_b32_e32 v49, v13, v48 -; GCN-NEXT: v_or_b32_e32 v40, v14, v41 -; GCN-NEXT: v_or_b32_e32 v11, v15, v39 -; GCN-NEXT: v_or_b32_e32 v57, v16, v53 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v36, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v18, v14 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v19, v13 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v34 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v23 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v33, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v58 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v27, v46 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v45 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v29, v43 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v42 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v35, v47 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v54 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v62 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v61 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v59 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v63 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v44 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v48 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v61, v1, v12, 16 -; GCN-NEXT: v_mov_b32_e32 v12, v33 -; GCN-NEXT: v_alignbit_b32 v33, v31, v14, 16 -; GCN-NEXT: v_mov_b32_e32 v27, v56 -; GCN-NEXT: v_alignbit_b32 v59, v56, v13, 16 -; GCN-NEXT: v_mov_b32_e32 v13, v19 -; GCN-NEXT: v_alignbit_b32 v29, v4, v34, 16 -; GCN-NEXT: v_alignbit_b32 v0, v5, v20, 16 -; GCN-NEXT: v_alignbit_b32 v14, v6, v21, 16 -; GCN-NEXT: v_mov_b32_e32 v21, v18 -; GCN-NEXT: v_mov_b32_e32 v18, v10 -; GCN-NEXT: v_mov_b32_e32 v56, v16 -; GCN-NEXT: v_mov_b32_e32 v16, v36 -; GCN-NEXT: v_mov_b32_e32 v10, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v7, v22, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v62, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v37, v23, 16 -; GCN-NEXT: v_mov_b32_e32 v23, v9 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v10, v58, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v51 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v51, v46, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v58, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v55, v45, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v52, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v49, v43, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v46, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v40, v42, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v41, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v47, 16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v3, v57, v54, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; kill: killed $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; kill: killed $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; kill: killed $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB48_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v25, v27 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v30, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v38, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v1, v20, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v1, v51, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v28, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v6, v22, v7 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v1, v8 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v45, v9 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v24, v10 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v1, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v1, v14 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v1, v18 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v1, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v1, v20 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v1, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v1, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_mov_b32_e32 v2, v35 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v1, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v49 -; GCN-NEXT: v_mov_b32_e32 v43, v42 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_mov_b32_e32 v44, v40 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v1, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_mov_b32_e32 v52, v36 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v24, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_mov_b32_e32 v45, v53 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v24, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v24, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v31, v29 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v36, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v11 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v2, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v2, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v63, v54 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v2, v40 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v41, v47, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v15 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v45, v52, v45 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v47, v50, v47 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v48, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v61, v61, v2 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v1, v4 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s7, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, s7, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s7, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s7, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s7, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, s7, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, s7, v38 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v62 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v38, v1 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v38, v6 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v38, v7 -; GCN-NEXT: v_or_b32_e32 v8, v39, v8 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: v_or_b32_e32 v10, v55, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: v_or_b32_e32 v13, v25, v13 -; GCN-NEXT: v_or_b32_e32 v14, v27, v14 -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v29, v16 -; GCN-NEXT: v_or_b32_e32 v17, v31, v17 -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v33, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v37, v22 -; GCN-NEXT: v_or_b32_e32 v24, v49, v26 -; GCN-NEXT: v_or_b32_e32 v25, v54, v30 -; GCN-NEXT: v_or_b32_e32 v26, v40, v35 -; GCN-NEXT: v_or_b32_e32 v28, v41, v51 -; GCN-NEXT: v_or_b32_e32 v30, v43, v42 -; GCN-NEXT: v_or_b32_e32 v33, v44, v46 -; GCN-NEXT: v_or_b32_e32 v34, v45, v57 -; GCN-NEXT: v_or_b32_e32 v38, v47, v58 -; GCN-NEXT: v_or_b32_e32 v39, v56, v59 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v61 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v34 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v39 -; GCN-NEXT: v_alignbit_b32 v61, v1, v16, 16 -; GCN-NEXT: v_alignbit_b32 v33, v31, v56, 16 -; GCN-NEXT: v_alignbit_b32 v59, v25, v18, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v29, v4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v0, v5, v21, 16 -; GCN-NEXT: v_alignbit_b32 v2, v6, v13, 16 -; GCN-NEXT: v_alignbit_b32 v19, v7, v23, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v17, v14, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v19, v15, v12, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v10, v53, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v11, v52, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v48, v51, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v36, v50, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v35, v49, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v3, v32, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v27, v37, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v31 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v62, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v58, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v52, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v46, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v41, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: .LBB48_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v61 -; GCN-NEXT: v_or_b32_e32 v8, v8, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v19 -; GCN-NEXT: buffer_store_dword v8, v60, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 4, v60 -; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; GCN-NEXT: v_or_b32_e32 v56, v1, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 8, v60 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v60 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 16, v60 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 20, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; GCN-NEXT: v_or_b32_e32 v61, v2, v3 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 36, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v47, v2, v3 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v23, v3, v5 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 48, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v15, v3, v5 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v57, v3, v5 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v21, v3, v5 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v3, v5 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v25, v3, v5 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v10 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v27, v3, v5 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v30, v3, v5 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v32, v3, v5 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v34, v3, v5 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v58 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v29, v3, v5 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v37, v3, v5 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v3, v5 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v48, v3, v7 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v50, v3, v7 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v55, v3, v7 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v41 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v54, v3, v7 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v41, v3, v7 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v52, v3, v7 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x78, v60 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v60 -; GCN-NEXT: buffer_store_dword v56, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:360 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:332 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v41, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v55, v24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v13, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v26, v30, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v35, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v53, v4, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v1, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v43, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v4, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v46, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v56, v4, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v57, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v61, v4, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v10, v22 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v10, v22 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v29, v22, v34 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v34, v37, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v37, v51, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v51, v22, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v47, v59, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v54, v22, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v59, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v60, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v58, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v40, v22, v40 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v15, v22, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v45, v22, v62 +; SI-NEXT: v_mov_b32_e32 v62, v18 +; SI-NEXT: v_or_b32_e32 v20, v20, v62 +; SI-NEXT: v_or_b32_e32 v22, v19, v3 +; SI-NEXT: v_alignbit_b32 v3, v20, v3, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v3, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v3, v9 +; SI-NEXT: v_alignbit_b32 v3, v5, v9, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v41, v3, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v18, v3, v13 +; SI-NEXT: v_alignbit_b32 v3, v41, v13, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v13, v3, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v39, v3, v21 +; SI-NEXT: v_alignbit_b32 v3, v13, v21, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v32, v3, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v42, v3, v27 +; SI-NEXT: v_alignbit_b32 v3, v32, v27, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v27, v3, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v27, v31, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v21, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v31, v3, v50 +; SI-NEXT: v_alignbit_b32 v3, v21, v50, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v53, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v6, v6, v43 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v3, v43, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v9, v6, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v9, v46, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v63, v6, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v63, v57, 16 +; SI-NEXT: v_or_b32_e32 v57, v4, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v52, v4, v33 +; SI-NEXT: v_alignbit_b32 v4, v57, v33, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v46, v4, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v38, v4, v37 +; SI-NEXT: v_alignbit_b32 v4, v46, v37, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v44, v4, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_or_b32_e32 v36, v4, v47 +; SI-NEXT: v_alignbit_b32 v4, v44, v47, 16 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v54 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_or_b32_e32 v43, v4, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_or_b32_e32 v12, v4, v14 +; SI-NEXT: v_alignbit_b32 v4, v43, v14, 16 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v61, v6, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v58, v6, v49 +; SI-NEXT: v_alignbit_b32 v6, v61, v49, 16 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB96_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 +; SI-NEXT: v_mov_b32_e32 v28, v24 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v15, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v5 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v3 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v57, vcc, s7, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v9 +; SI-NEXT: v_mov_b32_e32 v58, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v63, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v23 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v25, v25, v21 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v41, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v1 +; SI-NEXT: v_alignbit_b32 v1, v20, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v5, v25, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v13, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v27, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v11, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v3, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v9, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v63, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v57, v6, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: .LBB96_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64i16: ; VI: ; %bb.0: @@ -101272,7 +205673,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -101754,9 +206155,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload @@ -102143,7 +206544,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 ; VI-NEXT: v_or_b32_e32 v21, v39, v21 ; VI-NEXT: v_or_b32_e32 v31, v31, v54 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -102519,7 +206920,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -103002,9 +207403,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload @@ -103395,7 +207796,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 ; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 ; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -103640,15 +208041,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_4 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4 +; GFX11-TRUE16-NEXT: .LBB96_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB48_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l @@ -103907,8 +208308,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 +; GFX11-TRUE16-NEXT: .LBB96_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 @@ -104182,2316 +208583,19275 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v62, s28, 0 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v62, s27, 1 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v62, s25, 2 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v62, s24, 3 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v62, s23, 4 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v62, s22, 5 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v62, s21, 6 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v62, s20, 7 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v62, s19, 8 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v62, s18, 9 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v62, s16, 10 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_readfirstlane_b32 s76, v18 +; SI-NEXT: v_readfirstlane_b32 s40, v25 +; SI-NEXT: v_readfirstlane_b32 s16, v24 +; SI-NEXT: v_readfirstlane_b32 s42, v23 +; SI-NEXT: v_readfirstlane_b32 s52, v20 +; SI-NEXT: v_readfirstlane_b32 s8, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s50, v35 +; SI-NEXT: v_readfirstlane_b32 s31, v36 +; SI-NEXT: v_readfirstlane_b32 s53, v37 +; SI-NEXT: v_readfirstlane_b32 s82, v48 +; SI-NEXT: v_readfirstlane_b32 s7, v49 +; SI-NEXT: v_readfirstlane_b32 s79, v52 +; SI-NEXT: v_readfirstlane_b32 s78, v55 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v56 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s4, v57 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s4, v60 +; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s36, v32 +; SI-NEXT: v_readfirstlane_b32 s71, v33 +; SI-NEXT: v_readfirstlane_b32 s77, v59 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v61 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: v_readfirstlane_b32 s98, v50 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s67, v53 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: v_writelane_b32 v62, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_readfirstlane_b32 s81, v34 +; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: v_readfirstlane_b32 s68, v42 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 +; SI-NEXT: v_readfirstlane_b32 s49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 +; SI-NEXT: v_readfirstlane_b32 s51, v54 +; SI-NEXT: v_readfirstlane_b32 s97, v51 +; SI-NEXT: v_readfirstlane_b32 s35, v27 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s28, v28 +; SI-NEXT: v_readfirstlane_b32 s87, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_readfirstlane_b32 s96, v17 +; SI-NEXT: v_readfirstlane_b32 s99, v16 +; SI-NEXT: v_readfirstlane_b32 s89, v15 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: v_readfirstlane_b32 s30, v11 +; SI-NEXT: v_readfirstlane_b32 s64, v10 +; SI-NEXT: v_readfirstlane_b32 s55, v9 +; SI-NEXT: v_readfirstlane_b32 s65, v8 +; SI-NEXT: v_readfirstlane_b32 s80, v7 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s74, v1 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; SI-NEXT: v_readfirstlane_b32 s93, v36 +; SI-NEXT: v_readfirstlane_b32 s24, v37 +; SI-NEXT: v_readfirstlane_b32 s27, v48 +; SI-NEXT: v_readfirstlane_b32 s84, v43 +; SI-NEXT: v_readfirstlane_b32 s83, v44 +; SI-NEXT: v_readfirstlane_b32 s85, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s20, v47 +; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_writelane_b32 v62, s4, 19 +; SI-NEXT: v_readfirstlane_b32 s23, v49 +; SI-NEXT: v_readfirstlane_b32 s92, v52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s90, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: v_readfirstlane_b32 s38, v32 +; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: v_readfirstlane_b32 s54, v59 +; SI-NEXT: v_readfirstlane_b32 s57, v60 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s56, v61 +; SI-NEXT: v_readfirstlane_b32 s59, v55 +; SI-NEXT: v_readfirstlane_b32 s61, v41 +; SI-NEXT: v_readfirstlane_b32 s19, v45 +; SI-NEXT: v_readfirstlane_b32 s34, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_readfirstlane_b32 s25, v53 +; SI-NEXT: v_readfirstlane_b32 s91, v40 +; SI-NEXT: v_readfirstlane_b32 s37, v34 +; SI-NEXT: v_readfirstlane_b32 s47, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s46, v57 +; SI-NEXT: v_readfirstlane_b32 s22, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_readfirstlane_b32 s72, v39 +; SI-NEXT: v_readfirstlane_b32 s94, v51 +; SI-NEXT: v_readfirstlane_b32 s48, v54 +; SI-NEXT: v_readfirstlane_b32 s66, v43 +; SI-NEXT: v_readfirstlane_b32 s69, v44 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s45, v46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_writelane_b32 v62, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s17, 23 +; SI-NEXT: v_writelane_b32 v62, s40, 24 +; SI-NEXT: v_writelane_b32 v62, s16, 25 +; SI-NEXT: v_writelane_b32 v62, s42, 26 +; SI-NEXT: v_writelane_b32 v62, s46, 27 +; SI-NEXT: v_writelane_b32 v62, s47, 28 +; SI-NEXT: v_writelane_b32 v62, s56, 29 +; SI-NEXT: v_writelane_b32 v62, s57, 30 +; SI-NEXT: v_writelane_b32 v62, s45, 31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s58, v52 +; SI-NEXT: v_writelane_b32 v62, s49, 32 +; SI-NEXT: v_writelane_b32 v62, s58, 33 +; SI-NEXT: v_writelane_b32 v62, s59, 34 +; SI-NEXT: v_writelane_b32 v62, s52, 35 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s60, v38 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s60, 36 +; SI-NEXT: v_writelane_b32 v62, s61, 37 +; SI-NEXT: v_writelane_b32 v62, s93, 38 +; SI-NEXT: v_writelane_b32 v62, s8, 39 +; SI-NEXT: v_readfirstlane_b32 s62, v47 +; SI-NEXT: v_writelane_b32 v62, s72, 40 +; SI-NEXT: v_readfirstlane_b32 s73, v58 +; SI-NEXT: v_writelane_b32 v62, s62, 41 +; SI-NEXT: v_writelane_b32 v62, s73, 42 +; SI-NEXT: v_writelane_b32 v62, s35, 43 +; SI-NEXT: v_writelane_b32 v62, s94, 44 +; SI-NEXT: v_writelane_b32 v62, s48, 45 +; SI-NEXT: v_writelane_b32 v62, s91, 46 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v50 +; SI-NEXT: v_writelane_b32 v62, s66, 47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v18 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB97_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_readlane_b32 s4, v62, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s63, s5, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: s_and_b32 s5, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s4, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_or_b32 s9, s9, s5 +; SI-NEXT: s_and_b32 s5, s4, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 +; SI-NEXT: s_or_b32 s4, s5, s10 +; SI-NEXT: v_writelane_b32 v62, s4, 49 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_writelane_b32 v62, s54, 50 +; SI-NEXT: s_lshl_b32 s11, s21, 24 +; SI-NEXT: s_mov_b32 s18, s22 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_or_b32 s21, s11, s5 +; SI-NEXT: s_and_b32 s11, s26, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s4, 24 +; SI-NEXT: s_or_b32 s14, s12, s11 +; SI-NEXT: s_and_b32 s11, s80, 0xff +; SI-NEXT: s_lshl_b32 s12, s65, 8 +; SI-NEXT: s_or_b32 s12, s11, s12 +; SI-NEXT: s_and_b32 s11, s55, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s13, s64, 24 +; SI-NEXT: s_or_b32 s41, s13, s11 +; SI-NEXT: s_and_b32 s11, s89, 0xff +; SI-NEXT: s_lshl_b32 s13, s99, 8 +; SI-NEXT: s_or_b32 s13, s11, s13 +; SI-NEXT: s_and_b32 s11, s96, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s76, 24 +; SI-NEXT: s_or_b32 s43, s15, s11 +; SI-NEXT: s_and_b32 s11, s42, 0xff +; SI-NEXT: s_lshl_b32 s15, s16, 8 +; SI-NEXT: s_or_b32 s16, s11, s15 +; SI-NEXT: s_and_b32 s11, s40, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s87, 24 +; SI-NEXT: s_or_b32 s44, s15, s11 +; SI-NEXT: s_and_b32 s11, s73, 0xff +; SI-NEXT: s_lshl_b32 s15, s62, 8 +; SI-NEXT: s_or_b32 s62, s11, s15 +; SI-NEXT: s_and_b32 s11, s58, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s45, 24 +; SI-NEXT: s_or_b32 s45, s15, s11 +; SI-NEXT: s_and_b32 s11, s48, 0xff +; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: s_or_b32 s10, s11, s15 +; SI-NEXT: s_and_b32 s11, s46, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s47, 24 +; SI-NEXT: s_or_b32 s46, s15, s11 +; SI-NEXT: s_and_b32 s11, s25, 0xff +; SI-NEXT: s_lshl_b32 s15, s34, 8 +; SI-NEXT: s_or_b32 s94, s11, s15 +; SI-NEXT: s_and_b32 s11, s72, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s60, 24 +; SI-NEXT: s_or_b32 s47, s15, s11 +; SI-NEXT: s_and_b32 s11, s61, 0xff +; SI-NEXT: s_lshl_b32 s15, s59, 8 +; SI-NEXT: s_or_b32 s73, s11, s15 +; SI-NEXT: s_and_b32 s11, s56, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s57, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 20 +; SI-NEXT: s_or_b32 s56, s15, s11 +; SI-NEXT: s_and_b32 s11, s38, 0xff +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: s_or_b32 s48, s11, s15 +; SI-NEXT: s_and_b32 s11, s92, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s23, 24 +; SI-NEXT: s_or_b32 vcc_lo, s15, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xff +; SI-NEXT: s_lshl_b32 s15, s85, 8 +; SI-NEXT: s_or_b32 s72, s11, s15 +; SI-NEXT: s_and_b32 s11, s83, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s84, 24 +; SI-NEXT: s_or_b32 vcc_hi, s15, s11 +; SI-NEXT: s_and_b32 s11, s93, 0xff +; SI-NEXT: s_lshl_b32 s15, s97, 8 +; SI-NEXT: s_or_b32 s57, s11, s15 +; SI-NEXT: s_and_b32 s11, s67, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: v_writelane_b32 v62, s67, 51 +; SI-NEXT: s_mov_b32 s67, s51 +; SI-NEXT: s_mov_b32 s51, s74 +; SI-NEXT: s_or_b32 s74, s15, s11 +; SI-NEXT: s_and_b32 s11, s98, 0xff +; SI-NEXT: s_lshl_b32 s15, s75, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 18 +; SI-NEXT: v_writelane_b32 v62, s87, 52 +; SI-NEXT: s_or_b32 s58, s11, s15 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_writelane_b32 v62, s25, 53 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s81, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: s_mov_b32 s54, s75 +; SI-NEXT: s_or_b32 s75, s15, s11 +; SI-NEXT: s_and_b32 s11, s77, 0xff +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: s_or_b32 s59, s11, s15 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 13 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 12 +; SI-NEXT: s_mov_b32 s95, s69 +; SI-NEXT: s_mov_b32 s69, s76 +; SI-NEXT: s_or_b32 s76, s15, s11 +; SI-NEXT: s_and_b32 s11, s86, 0xff +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: s_or_b32 s60, s11, s15 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_mov_b32 s93, s99 +; SI-NEXT: s_mov_b32 s99, s84 +; SI-NEXT: s_mov_b32 s84, s77 +; SI-NEXT: s_or_b32 s77, s15, s11 +; SI-NEXT: s_and_b32 s11, s82, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_or_b32 s61, s11, s15 +; SI-NEXT: s_and_b32 s11, s31, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: s_mov_b32 s4, s85 +; SI-NEXT: s_mov_b32 s85, s83 +; SI-NEXT: s_mov_b32 s83, s82 +; SI-NEXT: s_mov_b32 s82, s53 +; SI-NEXT: s_mov_b32 s53, s50 +; SI-NEXT: s_mov_b32 s50, s31 +; SI-NEXT: s_mov_b32 s31, s78 +; SI-NEXT: s_or_b32 s78, s15, s11 +; SI-NEXT: v_readlane_b32 s11, v62, 10 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s15, s17, 8 +; SI-NEXT: s_or_b32 s11, s11, s15 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_mov_b32_e32 v51, s9 +; SI-NEXT: s_or_b32 s6, s11, s9 +; SI-NEXT: v_readlane_b32 s9, v62, 3 +; SI-NEXT: v_readlane_b32 s11, v62, 2 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s15, s11, 8 +; SI-NEXT: s_or_b32 s9, s9, s15 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s17, s9, s14 +; SI-NEXT: v_readlane_b32 s9, v62, 22 +; SI-NEXT: v_mov_b32_e32 v52, s14 +; SI-NEXT: s_and_b32 s14, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 21 +; SI-NEXT: s_lshl_b32 s15, s9, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v53, v6, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v50, s14, v53 +; SI-NEXT: s_and_b32 s14, s30, 0xff +; SI-NEXT: s_lshl_b32 s15, s88, 8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v54, v14, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v17, s14, v54 +; SI-NEXT: s_and_b32 s14, s8, 0xff +; SI-NEXT: s_lshl_b32 s15, s52, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v55, v40, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v16, s14, v55 +; SI-NEXT: s_and_b32 s14, s35, 0xff +; SI-NEXT: s_lshl_b32 s15, s28, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v40, v42, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v15, s14, v40 +; SI-NEXT: s_and_b32 s14, s95, 0xff +; SI-NEXT: s_lshl_b32 s15, s66, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v41, v61, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v12, s14, v41 +; SI-NEXT: s_and_b32 s14, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s91, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v42, v60, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v11, s14, v42 +; SI-NEXT: s_and_b32 s14, s37, 0xff +; SI-NEXT: s_lshl_b32 s15, s19, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_mov_b32 s91, s6 +; SI-NEXT: v_or_b32_e32 v59, v31, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 50 +; SI-NEXT: v_or_b32_e32 v10, s14, v59 +; SI-NEXT: s_and_b32 s14, s6, 0xff +; SI-NEXT: s_lshl_b32 s15, s70, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v24, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 19 +; SI-NEXT: v_or_b32_e32 v9, s14, v5 +; SI-NEXT: s_and_b32 s14, s90, 0xff +; SI-NEXT: s_lshl_b32 s15, s6, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v13, v25, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v8, s14, v13 +; SI-NEXT: s_and_b32 s14, s27, 0xff +; SI-NEXT: s_lshl_b32 s15, s24, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v26, v31 +; SI-NEXT: v_or_b32_e32 v31, v27, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v7, s14, v31 +; SI-NEXT: s_and_b32 s14, s49, 0xff +; SI-NEXT: s_lshl_b32 s15, s68, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v60 +; SI-NEXT: v_or_b32_e32 v60, v43, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: v_or_b32_e32 v4, s14, v60 +; SI-NEXT: s_and_b32 s14, s6, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 16 +; SI-NEXT: s_lshl_b32 s15, s6, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v23, v27 +; SI-NEXT: v_mov_b32_e32 v27, v24 +; SI-NEXT: v_mov_b32_e32 v24, v61 +; SI-NEXT: v_or_b32_e32 v61, v44, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s14, v61 +; SI-NEXT: s_and_b32 s14, s71, 0xff +; SI-NEXT: s_lshl_b32 s15, s36, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_readlane_b32 s6, v62, 48 +; SI-NEXT: v_or_b32_e32 v6, v45, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 49 +; SI-NEXT: v_or_b32_e32 v1, s14, v6 +; SI-NEXT: s_and_b32 s14, s79, 0xff +; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: s_or_b32 s42, s8, s63 +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s40, s8, s21 +; SI-NEXT: s_and_b32 s8, s12, 0xffff +; SI-NEXT: v_or_b32_e32 v14, v46, v3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s8, s41 +; SI-NEXT: s_and_b32 s8, s13, 0xffff +; SI-NEXT: v_or_b32_e32 v3, s14, v14 +; SI-NEXT: s_or_b32 s14, s8, s43 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_and_b32 s16, s73, 0xffff +; SI-NEXT: s_or_b32 s13, s8, s44 +; SI-NEXT: s_and_b32 s8, s62, 0xffff +; SI-NEXT: s_or_b32 s35, s16, s56 +; SI-NEXT: s_and_b32 s16, s48, 0xffff +; SI-NEXT: s_or_b32 s12, s8, s45 +; SI-NEXT: s_and_b32 s8, s10, 0xffff +; SI-NEXT: s_or_b32 s52, s16, vcc_lo +; SI-NEXT: s_and_b32 s16, s72, 0xffff +; SI-NEXT: s_or_b32 s10, s8, s46 +; SI-NEXT: s_and_b32 s8, s94, 0xffff +; SI-NEXT: s_or_b32 s94, s16, vcc_hi +; SI-NEXT: s_and_b32 s16, s57, 0xffff +; SI-NEXT: s_or_b32 s49, s16, s74 +; SI-NEXT: s_and_b32 s16, s58, 0xffff +; SI-NEXT: s_or_b32 s48, s16, s75 +; SI-NEXT: s_and_b32 s16, s59, 0xffff +; SI-NEXT: s_mov_b32 s25, s23 +; SI-NEXT: s_or_b32 s11, s16, s76 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_and_b32 s23, s61, 0xffff +; SI-NEXT: s_mov_b32 s87, s34 +; SI-NEXT: s_mov_b32 s34, s55 +; SI-NEXT: s_mov_b32 s55, s22 +; SI-NEXT: s_or_b32 s8, s8, s47 +; SI-NEXT: s_or_b32 s9, s16, s77 +; SI-NEXT: s_or_b32 s16, s23, s78 +; SI-NEXT: s_mov_b32 s22, s18 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_mov_b32_e32 v18, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_mov_b32_e32 v22, v46 +; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16 +; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 +; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 +; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 +; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 +; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 +; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 +; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, s35, v59, 16 +; SI-NEXT: v_alignbit_b32 v41, s52, v5, 16 +; SI-NEXT: v_alignbit_b32 v40, s94, v13, 16 +; SI-NEXT: v_alignbit_b32 v55, s49, v31, 16 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_alignbit_b32 v54, s48, v60, 16 +; SI-NEXT: v_mov_b32_e32 v60, v25 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_alignbit_b32 v53, s11, v61, 16 +; SI-NEXT: v_mov_b32_e32 v61, v24 +; SI-NEXT: v_mov_b32_e32 v24, v27 +; SI-NEXT: v_alignbit_b32 v52, s9, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, s16, v14, 16 +; SI-NEXT: s_lshr_b32 s73, s63, 16 +; SI-NEXT: s_lshr_b32 s72, s21, 16 +; SI-NEXT: s_lshr_b32 s63, s41, 16 +; SI-NEXT: s_lshr_b32 s62, s43, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s60, s45, 16 +; SI-NEXT: s_lshr_b32 s59, s46, 16 +; SI-NEXT: s_lshr_b32 s58, s47, 16 +; SI-NEXT: s_lshr_b32 s57, s56, 16 +; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s46, s74, 16 +; SI-NEXT: s_mov_b32 s74, s51 +; SI-NEXT: s_mov_b32 s51, s67 +; SI-NEXT: v_readlane_b32 s67, v62, 51 +; SI-NEXT: s_lshr_b32 s45, s75, 16 +; SI-NEXT: s_mov_b32 s23, s25 +; SI-NEXT: s_mov_b32 s21, s55 +; SI-NEXT: s_mov_b32 s55, s34 +; SI-NEXT: s_mov_b32 s75, s54 +; SI-NEXT: s_mov_b32 s34, s87 +; SI-NEXT: v_readlane_b32 s25, v62, 53 +; SI-NEXT: v_readlane_b32 s87, v62, 52 +; SI-NEXT: s_lshr_b32 s44, s76, 16 +; SI-NEXT: v_readlane_b32 s54, v62, 50 +; SI-NEXT: s_lshr_b32 s43, s77, 16 +; SI-NEXT: s_mov_b32 s76, s69 +; SI-NEXT: s_mov_b32 s69, s95 +; SI-NEXT: s_mov_b32 s77, s84 +; SI-NEXT: s_mov_b32 s84, s99 +; SI-NEXT: s_mov_b32 s99, s93 +; SI-NEXT: s_lshr_b32 s41, s78, 16 +; SI-NEXT: s_mov_b32 s78, s31 +; SI-NEXT: s_mov_b32 s31, s50 +; SI-NEXT: s_mov_b32 s50, s53 +; SI-NEXT: s_mov_b32 s53, s82 +; SI-NEXT: s_mov_b32 s82, s83 +; SI-NEXT: s_mov_b32 s83, s85 +; SI-NEXT: s_mov_b32 s85, s4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v6, v20 +; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: s_branch .LBB97_3 +; SI-NEXT: .LBB97_2: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v46 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v18, v43 +; SI-NEXT: v_mov_b32_e32 v23, v27 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: .LBB97_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v5, v39 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB97_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s79, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_add_i32 s4, s82, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_add_i32 s8, s31, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s8, s36, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_readlane_b32 s7, v62, 12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: s_add_i32 s36, s86, 3 +; SI-NEXT: s_lshl_b32 s8, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v62, 11 +; SI-NEXT: v_or_b32_e32 v2, s5, v2 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: s_add_i32 s9, s7, 3 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_add_i32 s16, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: s_add_i32 s9, s5, 0x3000000 +; SI-NEXT: s_add_i32 s79, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_add_i32 s4, s77, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: v_readlane_b32 s6, v62, 14 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 13 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 32 +; SI-NEXT: s_add_i32 s53, s4, 3 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_add_i32 s93, s98, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 18 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s93, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s81, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s86, s27, 3 +; SI-NEXT: s_add_i32 s48, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s24, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 38 +; SI-NEXT: s_add_i32 s68, s4, 3 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s97, 8 +; SI-NEXT: s_add_i32 s8, s67, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s51, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s50, s90, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 19 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v5 +; SI-NEXT: s_add_i32 s49, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: s_add_i32 s94, s20, 3 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 +; SI-NEXT: s_add_i32 s8, s83, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s84, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s54, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v5 +; SI-NEXT: s_add_i32 s94, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v49 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s98, s38, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 20 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s8, s92, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s37, 3 +; SI-NEXT: s_add_i32 s52, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 34 +; SI-NEXT: v_readlane_b32 s6, v62, 29 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s38, s6, 3 +; SI-NEXT: s_and_b32 s8, s38, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s85, s25, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 40 +; SI-NEXT: s_add_i32 s70, s6, 3 +; SI-NEXT: s_and_b32 s7, s70, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s19, s69, 3 +; SI-NEXT: s_add_i32 s51, s30, 3 +; SI-NEXT: s_add_i32 s95, s89, 3 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v39, s9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_mov_b32_e32 v28, s11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 +; SI-NEXT: v_mov_b32_e32 v27, s48 +; SI-NEXT: v_mov_b32_e32 v26, s49 +; SI-NEXT: v_mov_b32_e32 v25, s94 +; SI-NEXT: v_mov_b32_e32 v24, s52 +; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 +; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 +; SI-NEXT: s_lshr_b32 s56, s52, 16 +; SI-NEXT: s_lshr_b32 s47, s94, 16 +; SI-NEXT: s_lshr_b32 s46, s49, 16 +; SI-NEXT: s_lshr_b32 s45, s48, 16 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s9, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 37 +; SI-NEXT: s_add_i32 s67, s4, 3 +; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 30 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 46 +; SI-NEXT: s_add_i32 s35, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_mov_b32_e32 v23, s35 +; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 +; SI-NEXT: s_lshr_b32 s57, s35, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 36 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 47 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 44 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 +; SI-NEXT: s_lshr_b32 s58, s8, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 45 +; SI-NEXT: s_add_i32 s6, s4, 3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 27 +; SI-NEXT: s_add_i32 s34, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 28 +; SI-NEXT: s_and_b32 s6, s34, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s10, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 43 +; SI-NEXT: s_add_i32 s97, s4, 3 +; SI-NEXT: s_and_b32 s4, s97, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 41 +; SI-NEXT: v_readlane_b32 s6, v62, 33 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s54, s6, 3 +; SI-NEXT: s_and_b32 s6, s54, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 +; SI-NEXT: s_lshr_b32 s59, s10, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v14, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 42 +; SI-NEXT: s_add_i32 s81, s4, 3 +; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 31 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 39 +; SI-NEXT: s_add_i32 s69, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 35 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 25 +; SI-NEXT: v_readlane_b32 s6, v62, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s92, s6, 3 +; SI-NEXT: s_and_b32 s6, s92, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_mov_b32_e32 v19, s12 +; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 +; SI-NEXT: s_lshr_b32 s60, s12, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 26 +; SI-NEXT: s_add_i32 s31, s4, 3 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s87, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s13, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s88, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 8 +; SI-NEXT: s_add_i32 s6, s96, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v18, s13 +; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 +; SI-NEXT: s_lshr_b32 s61, s13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s76, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s14, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 22 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 21 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_add_i32 s6, s55, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 +; SI-NEXT: s_lshr_b32 s62, s14, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_add_i32 s4, s80, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s64, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 2 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s74, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s21, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s40, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 23 +; SI-NEXT: v_readlane_b32 s6, v62, 9 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s91, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: v_readlane_b32 s6, v62, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s42, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v13, s91 +; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v6 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_alignbit_b32 v57, s42, v13, 16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 +; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 +; SI-NEXT: s_lshr_b32 s73, s42, 16 +; SI-NEXT: s_lshr_b32 s72, s40, 16 +; SI-NEXT: s_lshr_b32 s63, s15, 16 +; SI-NEXT: .LBB97_5: ; %end +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v58 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v45 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: s_and_b32 s4, s94, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: s_and_b32 s4, s49, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: s_and_b32 s4, s48, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v46, v16 +; VI-NEXT: v_mov_b32_e32 v60, v5 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_mov_b32_e32 v62, v21 +; VI-NEXT: v_mov_b32_e32 v47, v17 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v18 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:88 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:204 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v22 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:284 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:280 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:312 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB97_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v60 +; VI-NEXT: v_mov_b32_e32 v28, v26 +; VI-NEXT: v_mov_b32_e32 v26, v23 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v22 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v59, v10 +; VI-NEXT: v_mov_b32_e32 v58, v43 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v27, v14 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, v63 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v15 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v57 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v60 +; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v56, v45 +; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v53 +; VI-NEXT: v_mov_b32_e32 v55, v63 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v46 +; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v41, v52 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v54 +; VI-NEXT: v_mov_b32_e32 v54, v49 +; VI-NEXT: v_mov_b32_e32 v49, v53 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB97_3 +; VI-NEXT: .LBB97_2: +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: v_mov_b32_e32 v57, v61 +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v56, v45 +; VI-NEXT: v_mov_b32_e32 v51, v42 +; VI-NEXT: v_mov_b32_e32 v39, v41 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: v_mov_b32_e32 v36, v49 +; VI-NEXT: v_mov_b32_e32 v35, v63 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v53 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: v_mov_b32_e32 v52, v38 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB97_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v38, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB97_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v47 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v53, v34 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_and_b32 s10, s24, 0xff +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_and_b32 s12, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s11, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s12 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 +; VI-NEXT: s_and_b32 s10, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v39, 0xff, v39 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s10, 16 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v31, 24, v63 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x300, v28 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x300, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v23 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 24, v59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v43, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v26 +; VI-NEXT: v_and_b32_e32 v26, 0xff, v43 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_or_b32_sdwa v26, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v44, vcc, 3, v51 +; VI-NEXT: v_and_b32_e32 v27, 0xff, v44 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x300, v5 +; VI-NEXT: v_lshlrev_b32_e32 v32, 24, v49 +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; VI-NEXT: v_or_b32_e32 v4, v4, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v29, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v5, v29, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v6, v29, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v8, v29, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v9, v29, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v10, v29, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v35 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v52 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v35, 24, v56 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v37 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v41 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v14, v29, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v62 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v55 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v36 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v29, v29, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v53 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v19, v29, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v57 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_or_b32_sdwa v20, v29, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v38 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v22 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: v_or_b32_sdwa v55, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 24, v54 +; VI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v40, 24, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v30, 24, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v41, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; VI-NEXT: v_and_b32_e32 v41, 0xff, v41 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; VI-NEXT: v_or_b32_e32 v55, v40, v55 +; VI-NEXT: v_or_b32_sdwa v23, v55, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v42, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v42 +; VI-NEXT: v_or_b32_e32 v54, v54, v55 +; VI-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v53, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_or_b32_e32 v35, v35, v53 +; VI-NEXT: v_or_b32_sdwa v25, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v52, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_and_b32_e32 v51, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; VI-NEXT: v_or_b32_e32 v29, v29, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v49, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_e32 v31, v31, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 +; VI-NEXT: .LBB97_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v39, v16 +; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v55, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v50, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v49, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v45 +; GFX9-NEXT: v_mov_b32_e32 v45, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB97_3 +; GFX9-NEXT: .LBB97_2: +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB97_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB97_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_lshl_b32 s6, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshl_b32 s9, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshl_b32 s10, s19, 8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v2, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v68 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v165 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_3 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v167, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v176, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v165 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v165, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v70, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v69, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v64, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v53, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v38, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v37, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v34, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v53, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v49, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 +; GFX11-TRUE16-NEXT: .LBB97_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_3 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 +; GFX11-FAKE16-NEXT: .LBB97_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v64i16_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v46, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v45, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v43, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v41, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v40, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v55, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v53, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v51, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v49, v2, v27 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v48, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v39, v2, v19 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v38, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v37, v2, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v36, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v35, v2, v29 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v34, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v2, v21 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v31, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v32, v2, v17 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v26, v2, v25 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v2, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v18, v2, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v22, v2, v9 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v14, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v10, v2, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v60, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v61, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v62, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v6, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v6, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v6, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v12, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v20, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v47, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v62, v22, v18, 24 +; SI-NEXT: v_alignbit_b32 v63, v22, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_alignbit_b32 v56, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v57, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v61, v10, v14, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: v_alignbit_b32 v4, v45, v46, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v10 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v11 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v13 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v14 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v46 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: .LBB98_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB98_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v31, 3 +; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v33, 3, v30 +; VI-NEXT: v_add_u16_e32 v34, 3, v29 +; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v30, v33, v29 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; VI-NEXT: v_add_u16_e32 v33, 3, v37 +; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v29, v34, v29 +; VI-NEXT: v_add_u16_e32 v34, 3, v36 +; VI-NEXT: v_or_b32_e32 v37, v33, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 +; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v36, v34, v32 +; VI-NEXT: v_add_u16_e32 v33, 3, v2 +; VI-NEXT: v_add_u16_e32 v34, 3, v1 +; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; VI-NEXT: v_or_b32_e32 v2, v33, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v1, v34, v1 +; VI-NEXT: v_add_u16_e32 v33, 3, v4 +; VI-NEXT: v_add_u16_e32 v34, 3, v3 +; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; VI-NEXT: v_or_b32_e32 v4, v33, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v3, v34, v3 +; VI-NEXT: v_add_u16_e32 v33, 3, v6 +; VI-NEXT: v_add_u16_e32 v34, 3, v5 +; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; VI-NEXT: v_or_b32_e32 v6, v33, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v34, v5 +; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v38, 3, v8 +; VI-NEXT: v_add_u16_e32 v33, 3, v7 +; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; VI-NEXT: v_or_b32_e32 v8, v38, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; VI-NEXT: v_add_u16_e32 v33, 3, v10 +; VI-NEXT: v_add_u16_e32 v38, 3, v9 +; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; VI-NEXT: v_or_b32_e32 v10, v33, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v38, v9 +; VI-NEXT: v_add_u16_e32 v33, 3, v12 +; VI-NEXT: v_add_u16_e32 v38, 3, v11 +; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v12, v33, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v38, v11 +; VI-NEXT: v_add_u16_e32 v38, 3, v14 +; VI-NEXT: v_add_u16_e32 v49, 3, v13 +; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v38, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v16, v16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v13, v49, v13 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v60, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v63, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v59, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v47, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v56, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; VI-NEXT: v_mov_b32_e32 v46, v35 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v52, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v39, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v49, v53 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v51 +; VI-NEXT: v_bfe_u32 v31, v51, 8, 8 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_bfe_u32 v35, v58, 8, 8 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v39, v61, 8, 8 +; VI-NEXT: v_bfe_u32 v58, v48, 8, 8 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 +; VI-NEXT: .LBB98_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v128i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(45) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: .LBB98_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB98_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: .LBB98_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64i16_to_v128i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 +; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v51, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v68 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v51, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v65, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67 +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: .LBB98_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: s_mov_b32 s6, s18 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s62, v30 +; SI-NEXT: v_readfirstlane_b32 s63, v29 +; SI-NEXT: v_readfirstlane_b32 s59, v26 +; SI-NEXT: v_readfirstlane_b32 s60, v25 +; SI-NEXT: v_readfirstlane_b32 s98, v22 +; SI-NEXT: v_readfirstlane_b32 s61, v21 +; SI-NEXT: v_readfirstlane_b32 s99, v18 +; SI-NEXT: v_readfirstlane_b32 s58, v17 +; SI-NEXT: v_readfirstlane_b32 s96, v14 +; SI-NEXT: v_readfirstlane_b32 s97, v13 +; SI-NEXT: v_readfirstlane_b32 s86, v10 +; SI-NEXT: v_readfirstlane_b32 s87, v9 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s85, v5 +; SI-NEXT: v_readfirstlane_b32 s81, v2 +; SI-NEXT: v_readfirstlane_b32 s82, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s88, v36 +; SI-NEXT: v_readfirstlane_b32 s18, v37 +; SI-NEXT: v_readfirstlane_b32 s78, v38 +; SI-NEXT: v_readfirstlane_b32 s79, v39 +; SI-NEXT: v_readfirstlane_b32 s76, v48 +; SI-NEXT: v_readfirstlane_b32 s77, v49 +; SI-NEXT: v_readfirstlane_b32 s74, v50 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s75, v51 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s72, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s73, v53 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_writelane_b32 v62, s6, 0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_alignbit_b32 v8, s57, v1, 24 +; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s46 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16 +; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s42 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16 +; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: v_alignbit_b32 v1, s41, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s41, v16, 16 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s41, v16, 8 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: v_or_b32_e32 v14, v1, v4 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s61, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: v_alignbit_b32 v1, s40, v14, 24 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s60, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s40, v14, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s40, v14, 8 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s75, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: v_or_b32_e32 v12, v1, v5 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: v_alignbit_b32 v1, s15, v12, 24 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s15, v12, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s15, v12, 8 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshr_b32 s4, s10, 8 +; SI-NEXT: v_or_b32_e32 v10, v1, v6 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshr_b32 s4, s9, 8 +; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s14, v10, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s14, v10, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: s_lshr_b32 s4, s6, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: s_and_b32 s4, s72, 0xffff +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v1, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v1, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_writelane_b32 v62, s4, 5 +; SI-NEXT: s_and_b32 s4, s76, 0xffff +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_or_b32_e32 v13, v1, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: s_and_b32 s4, s78, 0xffff +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_or_b32_e32 v9, v1, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: s_and_b32 s4, s88, 0xffff +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_or_b32_e32 v6, v1, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 +; SI-NEXT: v_or_b32_e32 v4, v1, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: s_bfe_u32 s4, s76, 0x80008 +; SI-NEXT: v_or_b32_e32 v2, v1, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_writelane_b32 v62, s4, 7 +; SI-NEXT: s_bfe_u32 s4, s78, 0x80008 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: s_bfe_u32 s4, s88, 0x80008 +; SI-NEXT: v_mov_b32_e32 v29, v17 +; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: s_lshr_b32 s68, s57, 8 +; SI-NEXT: s_lshr_b32 s65, s47, 8 +; SI-NEXT: s_lshr_b32 s54, s45, 8 +; SI-NEXT: s_lshr_b32 s51, s43, 8 +; SI-NEXT: s_lshr_b32 s48, s41, 8 +; SI-NEXT: s_lshr_b32 s37, s40, 8 +; SI-NEXT: s_lshr_b32 s34, s15, 8 +; SI-NEXT: s_lshr_b32 s95, s14, 8 +; SI-NEXT: s_lshr_b32 s92, s13, 8 +; SI-NEXT: s_lshr_b32 s89, s12, 8 +; SI-NEXT: s_and_b32 s71, s19, 0xffff +; SI-NEXT: s_and_b32 s69, s23, 0xffff +; SI-NEXT: s_and_b32 s66, s27, 0xffff +; SI-NEXT: s_and_b32 s55, s81, 0xffff +; SI-NEXT: s_and_b32 s52, s84, 0xffff +; SI-NEXT: s_and_b32 s49, s86, 0xffff +; SI-NEXT: s_and_b32 s38, s96, 0xffff +; SI-NEXT: s_and_b32 s35, s99, 0xffff +; SI-NEXT: s_and_b32 s30, s98, 0xffff +; SI-NEXT: s_and_b32 s93, s59, 0xffff +; SI-NEXT: s_and_b32 s90, s62, 0xffff +; SI-NEXT: s_bfe_u32 s83, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s80, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s70, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s67, s81, 0x80008 +; SI-NEXT: s_bfe_u32 s64, s84, 0x80008 +; SI-NEXT: s_bfe_u32 s53, s86, 0x80008 +; SI-NEXT: s_bfe_u32 s50, s96, 0x80008 +; SI-NEXT: s_bfe_u32 s39, s99, 0x80008 +; SI-NEXT: s_bfe_u32 s36, s98, 0x80008 +; SI-NEXT: s_bfe_u32 s31, s59, 0x80008 +; SI-NEXT: s_bfe_u32 s94, s62, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s72, 0x80008 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: v_alignbit_b32 v45, s13, v8, 24 +; SI-NEXT: v_alignbit_b32 v47, s13, v8, 16 +; SI-NEXT: v_alignbit_b32 v57, s13, v8, 8 +; SI-NEXT: v_alignbit_b32 v41, s12, v5, 24 +; SI-NEXT: v_alignbit_b32 v43, s12, v5, 16 +; SI-NEXT: v_alignbit_b32 v44, s12, v5, 8 +; SI-NEXT: v_alignbit_b32 v21, s11, v13, 24 +; SI-NEXT: v_alignbit_b32 v22, s11, v13, 16 +; SI-NEXT: v_alignbit_b32 v24, s11, v13, 8 +; SI-NEXT: v_alignbit_b32 v17, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v20, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v59, s9, v6, 24 +; SI-NEXT: v_alignbit_b32 v60, s9, v6, 16 +; SI-NEXT: v_alignbit_b32 v61, s9, v6, 8 +; SI-NEXT: v_alignbit_b32 v46, s8, v4, 24 +; SI-NEXT: v_alignbit_b32 v56, s8, v4, 16 +; SI-NEXT: v_alignbit_b32 v58, s8, v4, 8 +; SI-NEXT: v_alignbit_b32 v55, s7, v2, 24 +; SI-NEXT: v_alignbit_b32 v40, s7, v2, 16 +; SI-NEXT: v_alignbit_b32 v42, s7, v2, 8 +; SI-NEXT: v_alignbit_b32 v52, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v53, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s75, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s60, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s61, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v21, s56 +; SI-NEXT: v_alignbit_b32 v22, s57, v21, 24 +; SI-NEXT: v_alignbit_b32 v50, s57, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s57, v21, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, s46 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24 +; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16 +; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8 +; SI-NEXT: v_mov_b32_e32 v21, s44 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_lshr_b32 s4, s10, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8 +; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshr_b32 s4, s9, 24 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, s43, v21, 8 +; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 5 +; SI-NEXT: s_lshr_b32 s4, s9, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v7 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v3, v16, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: s_lshr_b32 s4, s8, 24 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v7, v14, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 7 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v11 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v7, v14, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s7, 24 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v12, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v12, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: s_lshr_b32 s4, s6, 24 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v35, s6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_mov_b32_e32 v34, s7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_mov_b32_e32 v33, s8 +; SI-NEXT: v_mov_b32_e32 v32, s9 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v15, v10, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: s_lshr_b32 s4, s6, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v15, v10, 8 +; SI-NEXT: v_alignbit_b32 v45, v19, v8, 24 +; SI-NEXT: v_alignbit_b32 v47, v19, v8, 16 +; SI-NEXT: v_alignbit_b32 v57, v19, v8, 8 +; SI-NEXT: v_alignbit_b32 v41, v18, v5, 24 +; SI-NEXT: v_alignbit_b32 v43, v18, v5, 16 +; SI-NEXT: v_alignbit_b32 v44, v18, v5, 8 +; SI-NEXT: v_alignbit_b32 v21, v17, v13, 24 +; SI-NEXT: v_alignbit_b32 v22, v17, v13, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v13, 8 +; SI-NEXT: v_alignbit_b32 v17, v20, v9, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v9, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v9, 8 +; SI-NEXT: v_alignbit_b32 v59, v32, v6, 24 +; SI-NEXT: v_alignbit_b32 v60, v32, v6, 16 +; SI-NEXT: v_alignbit_b32 v61, v32, v6, 8 +; SI-NEXT: v_alignbit_b32 v46, v33, v4, 24 +; SI-NEXT: v_alignbit_b32 v56, v33, v4, 16 +; SI-NEXT: v_alignbit_b32 v58, v33, v4, 8 +; SI-NEXT: v_alignbit_b32 v55, v34, v2, 24 +; SI-NEXT: v_alignbit_b32 v40, v34, v2, 16 +; SI-NEXT: v_alignbit_b32 v42, v34, v2, 8 +; SI-NEXT: v_alignbit_b32 v52, v35, v1, 24 +; SI-NEXT: v_alignbit_b32 v53, v35, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v35, v1, 8 +; SI-NEXT: s_lshr_b32 s83, s57, 24 +; SI-NEXT: s_lshr_b32 s71, s57, 16 +; SI-NEXT: s_lshr_b32 s68, s57, 8 +; SI-NEXT: s_lshr_b32 s80, s47, 24 +; SI-NEXT: s_lshr_b32 s69, s47, 16 +; SI-NEXT: s_lshr_b32 s65, s47, 8 +; SI-NEXT: s_lshr_b32 s70, s45, 24 +; SI-NEXT: s_lshr_b32 s66, s45, 16 +; SI-NEXT: s_lshr_b32 s54, s45, 8 +; SI-NEXT: s_lshr_b32 s67, s43, 24 +; SI-NEXT: s_lshr_b32 s55, s43, 16 +; SI-NEXT: s_lshr_b32 s51, s43, 8 +; SI-NEXT: s_lshr_b32 s64, s41, 24 +; SI-NEXT: s_lshr_b32 s52, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s41, 8 +; SI-NEXT: s_lshr_b32 s53, s40, 24 +; SI-NEXT: s_lshr_b32 s49, s40, 16 +; SI-NEXT: s_lshr_b32 s37, s40, 8 +; SI-NEXT: s_lshr_b32 s50, s15, 24 +; SI-NEXT: s_lshr_b32 s38, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s15, 8 +; SI-NEXT: s_lshr_b32 s39, s14, 24 +; SI-NEXT: s_lshr_b32 s35, s14, 16 +; SI-NEXT: s_lshr_b32 s95, s14, 8 +; SI-NEXT: s_lshr_b32 s36, s13, 24 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 8 +; SI-NEXT: s_lshr_b32 s31, s12, 24 +; SI-NEXT: s_lshr_b32 s93, s12, 16 +; SI-NEXT: s_lshr_b32 s89, s12, 8 +; SI-NEXT: s_lshr_b32 s94, s11, 24 +; SI-NEXT: s_lshr_b32 s90, s11, 16 +; SI-NEXT: s_lshr_b32 s91, s10, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: s_lshl_b32 s16, s83, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s80, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_lshl_b32 s16, s70, 24 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: s_lshl_b32 s16, s67, 24 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s55, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_lshl_b32 s16, s64, 24 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_lshl_b32 s16, s53, 24 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s49, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s39, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v57 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v45 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s13, s36, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s13, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v43 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v41 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s31, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v24 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s94, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 2 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s91, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 9 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v58 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v56 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s8, v62, 7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v46 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 12 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v42 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 11 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s7, v62, 10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v55 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v29, v17 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v64i16_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: s_lshr_b32 s80, s21, 16 +; VI-NEXT: s_lshr_b32 s82, s21, 8 +; VI-NEXT: s_lshr_b32 s84, s20, 16 +; VI-NEXT: s_lshr_b32 s86, s20, 8 +; VI-NEXT: s_lshr_b32 s51, s19, 24 +; VI-NEXT: s_lshr_b32 s53, s19, 16 +; VI-NEXT: s_lshr_b32 s54, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s67, s17, 24 +; VI-NEXT: s_lshr_b32 s68, s17, 16 +; VI-NEXT: s_lshr_b32 s69, s17, 8 +; VI-NEXT: s_lshr_b32 s70, s16, 16 +; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s81, s41, 8 +; VI-NEXT: s_lshr_b32 s83, s40, 16 +; VI-NEXT: s_lshr_b32 s85, s40, 8 +; VI-NEXT: s_lshr_b32 s87, s43, 24 +; VI-NEXT: s_lshr_b32 s50, s43, 16 +; VI-NEXT: s_lshr_b32 s52, s43, 8 +; VI-NEXT: s_lshr_b32 s55, s42, 16 +; VI-NEXT: s_lshr_b32 s64, s42, 8 +; VI-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_and_b32 s47, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_or_b32 s45, s47, s45 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_or_b32 s44, s46, s44 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: s_and_b32 s57, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: s_or_b32 s29, s57, s29 +; VI-NEXT: v_writelane_b32 v21, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: s_and_b32 s56, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: s_and_b32 s59, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_or_b32 s28, s56, s28 +; VI-NEXT: v_writelane_b32 v21, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: s_or_b32 s27, s59, s27 +; VI-NEXT: v_writelane_b32 v21, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_and_b32 s58, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: s_and_b32 s61, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_or_b32 s26, s58, s26 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: s_or_b32 s25, s61, s25 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: s_and_b32 s60, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: s_and_b32 s63, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s24, s60, s24 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: s_or_b32 s23, s63, s23 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: s_and_b32 s62, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: s_and_b32 s73, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_or_b32 s22, s62, s22 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: s_and_b32 s79, s5, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s21, s73, s21 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: s_or_b32 s5, s79, s5 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: s_and_b32 s78, s4, 0xffff0000 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: s_and_b32 s89, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_or_b32 s4, s78, s4 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: s_or_b32 s7, s89, s7 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: s_and_b32 s88, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: s_and_b32 s91, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_or_b32 s6, s88, s6 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: s_or_b32 s9, s91, s9 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_and_b32 s90, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: s_and_b32 vcc_hi, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s8, s90, s8 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: s_or_b32 s11, vcc_hi, s11 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_and_b32 vcc_lo, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: s_and_b32 s31, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_or_b32 s10, vcc_lo, s10 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: s_or_b32 s13, s31, s13 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_and_b32 s30, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: s_and_b32 s35, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_or_b32 s12, s30, s12 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: s_or_b32 s15, s35, s15 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_and_b32 s34, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: s_and_b32 s37, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_or_b32 s14, s34, s14 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: s_and_b32 s72, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s74, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s77, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s36, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s38, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s39, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_or_b32 s41, s37, s41 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: s_or_b32 s43, s39, s43 +; VI-NEXT: s_or_b32 s42, s38, s42 +; VI-NEXT: s_or_b32 s40, s36, s40 +; VI-NEXT: s_or_b32 s17, s77, s17 +; VI-NEXT: s_or_b32 s16, s76, s16 +; VI-NEXT: s_or_b32 s19, s75, s19 +; VI-NEXT: s_or_b32 s18, s74, s18 +; VI-NEXT: s_or_b32 s20, s72, s20 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: s_lshr_b32 s80, s21, 16 +; VI-NEXT: s_lshr_b32 s82, s21, 8 +; VI-NEXT: s_lshr_b32 s84, s20, 16 +; VI-NEXT: s_lshr_b32 s86, s20, 8 +; VI-NEXT: s_lshr_b32 s51, s19, 24 +; VI-NEXT: s_lshr_b32 s53, s19, 16 +; VI-NEXT: s_lshr_b32 s54, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s67, s17, 24 +; VI-NEXT: s_lshr_b32 s68, s17, 16 +; VI-NEXT: s_lshr_b32 s69, s17, 8 +; VI-NEXT: s_lshr_b32 s70, s16, 16 +; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s81, s41, 8 +; VI-NEXT: s_lshr_b32 s83, s40, 16 +; VI-NEXT: s_lshr_b32 s85, s40, 8 +; VI-NEXT: s_lshr_b32 s87, s43, 24 +; VI-NEXT: s_lshr_b32 s50, s43, 16 +; VI-NEXT: s_lshr_b32 s52, s43, 8 +; VI-NEXT: s_lshr_b32 s55, s42, 16 +; VI-NEXT: s_lshr_b32 s64, s42, 8 +; VI-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: s_lshl_b32 s47, s71, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s16, s16, s47 +; VI-NEXT: s_lshl_b32 s47, s48, 8 +; VI-NEXT: s_and_b32 s57, s70, 0xff +; VI-NEXT: s_or_b32 s47, s57, s47 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s47, s47, 16 +; VI-NEXT: s_or_b32 s16, s16, s47 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s69, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s68, 0xff +; VI-NEXT: s_lshl_b32 s47, s67, 8 +; VI-NEXT: s_or_b32 s17, s17, s47 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_lshl_b32 s16, s66, 8 +; VI-NEXT: s_and_b32 s17, s18, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s38, 8 +; VI-NEXT: s_and_b32 s18, s65, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xff +; VI-NEXT: s_lshl_b32 s17, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s53, 0xff +; VI-NEXT: s_lshl_b32 s18, s51, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: s_lshl_b32 s16, s86, 8 +; VI-NEXT: s_and_b32 s17, s20, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s36, 8 +; VI-NEXT: s_and_b32 s18, s84, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xff +; VI-NEXT: s_lshl_b32 s17, s82, 8 +; VI-NEXT: v_readlane_b32 s18, v21, 25 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s80, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v6, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 24 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s22, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 23 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s34, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 22 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 21 +; VI-NEXT: v_readlane_b32 s18, v21, 20 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 19 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s24, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 18 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s30, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 16 +; VI-NEXT: v_readlane_b32 s18, v21, 15 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 14 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s26, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 13 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s90, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 12 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: s_and_b32 s16, s27, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 11 +; VI-NEXT: v_readlane_b32 s18, v21, 10 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 9 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s28, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 8 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s88, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 7 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: s_and_b32 s16, s29, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 6 +; VI-NEXT: v_readlane_b32 s18, v21, 5 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 4 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s44, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s76, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 2 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: s_and_b32 s16, s45, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 1 +; VI-NEXT: v_readlane_b32 s18, v21, 0 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s42, 0xff +; VI-NEXT: s_lshl_b32 s17, s64, 8 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s55, 0xff +; VI-NEXT: s_lshl_b32 s18, s78, 8 +; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s43, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s50, 0xff +; VI-NEXT: s_lshl_b32 s18, s87, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s40, 0xff +; VI-NEXT: s_lshl_b32 s17, s85, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s83, 0xff +; VI-NEXT: s_lshl_b32 s18, s74, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_lshl_b32 s17, s81, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 57 +; VI-NEXT: v_readlane_b32 s18, v21, 56 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 55 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 54 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: v_readlane_b32 s15, v21, 53 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: v_readlane_b32 s15, v21, 52 +; VI-NEXT: v_readlane_b32 s16, v21, 51 +; VI-NEXT: s_and_b32 s15, s15, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 50 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 49 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: v_readlane_b32 s13, v21, 48 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: v_readlane_b32 s13, v21, 47 +; VI-NEXT: v_readlane_b32 s14, v21, 46 +; VI-NEXT: s_and_b32 s13, s13, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 45 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 44 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s13, s60, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: v_readlane_b32 s11, v21, 43 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: v_readlane_b32 s11, v21, 42 +; VI-NEXT: v_readlane_b32 s12, v21, 41 +; VI-NEXT: s_and_b32 s11, s11, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 40 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 39 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s11, s58, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff +; VI-NEXT: v_readlane_b32 s9, v21, 38 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: v_readlane_b32 s9, v21, 37 +; VI-NEXT: v_readlane_b32 s10, v21, 36 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 35 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 34 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s9, s56, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: v_readlane_b32 s7, v21, 33 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 32 +; VI-NEXT: v_readlane_b32 s8, v21, 31 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 30 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 29 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: v_readlane_b32 s5, v21, 28 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s5, v21, 27 +; VI-NEXT: v_readlane_b32 s6, v21, 26 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s44, v3 +; GFX9-NEXT: v_readfirstlane_b32 s45, v4 +; GFX9-NEXT: v_readfirstlane_b32 s42, v5 +; GFX9-NEXT: v_readfirstlane_b32 s43, v6 +; GFX9-NEXT: v_readfirstlane_b32 s40, v7 +; GFX9-NEXT: v_readfirstlane_b32 s41, v8 +; GFX9-NEXT: v_readfirstlane_b32 s14, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB99_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s26, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s26, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s24, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s24, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s82, s11, 24 +; GFX9-NEXT: s_lshr_b32 s83, s11, 16 +; GFX9-NEXT: s_lshr_b32 s85, s11, 8 +; GFX9-NEXT: s_lshr_b32 s84, s10, 16 +; GFX9-NEXT: s_lshr_b32 s86, s10, 8 +; GFX9-NEXT: s_lshr_b32 s87, s13, 24 +; GFX9-NEXT: s_lshr_b32 s96, s13, 16 +; GFX9-NEXT: s_lshr_b32 s98, s13, 8 +; GFX9-NEXT: s_lshr_b32 s97, s12, 16 +; GFX9-NEXT: s_lshr_b32 s99, s12, 8 +; GFX9-NEXT: s_lshr_b32 s38, s15, 24 +; GFX9-NEXT: s_lshr_b32 s39, s15, 16 +; GFX9-NEXT: s_lshr_b32 s49, s15, 8 +; GFX9-NEXT: s_lshr_b32 s48, s14, 16 +; GFX9-NEXT: s_lshr_b32 s50, s14, 8 +; GFX9-NEXT: s_lshr_b32 s51, s41, 24 +; GFX9-NEXT: s_lshr_b32 s52, s41, 16 +; GFX9-NEXT: s_lshr_b32 s54, s41, 8 +; GFX9-NEXT: s_lshr_b32 s53, s40, 16 +; GFX9-NEXT: s_lshr_b32 s55, s40, 8 +; GFX9-NEXT: s_lshr_b32 s64, s43, 24 +; GFX9-NEXT: s_lshr_b32 s65, s43, 16 +; GFX9-NEXT: s_lshr_b32 s67, s43, 8 +; GFX9-NEXT: s_lshr_b32 s66, s42, 16 +; GFX9-NEXT: s_lshr_b32 s68, s42, 8 +; GFX9-NEXT: s_lshr_b32 s69, s45, 24 +; GFX9-NEXT: s_lshr_b32 s70, s45, 16 +; GFX9-NEXT: s_lshr_b32 s80, s45, 8 +; GFX9-NEXT: s_lshr_b32 s71, s44, 16 +; GFX9-NEXT: s_lshr_b32 s81, s44, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB99_4 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] +; GFX9-NEXT: v_pk_add_u16 v28, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] +; GFX9-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] +; GFX9-NEXT: v_pk_add_u16 v32, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] +; GFX9-NEXT: v_pk_add_u16 v34, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v33, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] +; GFX9-NEXT: v_pk_add_u16 v36, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v35, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] +; GFX9-NEXT: v_pk_add_u16 v38, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v37, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] +; GFX9-NEXT: v_pk_add_u16 v49, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v48, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v8, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v10, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v4 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: s_branch .LBB99_5 +; GFX9-NEXT: .LBB99_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB99_2 +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: v_mov_b32_e32 v15, s71 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s80 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s70 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s69 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s68 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s66 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s67 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s65 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s64 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s53 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s52 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s50 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s49 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s39 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s99 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s96 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s87 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s86 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s84 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s85 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s83 +; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s82 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v51, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v50, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: v_mov_b32_e32 v20, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: v_mov_b32_e32 v54, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: v_mov_b32_e32 v23, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: v_mov_b32_e32 v56, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s58 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s60 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s62 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s72 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s74 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s76 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s78 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s88 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s90 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s94 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s36 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: v_mov_b32_e32 v22, s45 +; GFX9-NEXT: v_mov_b32_e32 v13, s42 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v11, s40 +; GFX9-NEXT: v_mov_b32_e32 v12, s41 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v48, s16 +; GFX9-NEXT: v_mov_b32_e32 v49, s17 +; GFX9-NEXT: v_mov_b32_e32 v37, s18 +; GFX9-NEXT: v_mov_b32_e32 v38, s19 +; GFX9-NEXT: v_mov_b32_e32 v35, s20 +; GFX9-NEXT: v_mov_b32_e32 v36, s21 +; GFX9-NEXT: v_mov_b32_e32 v33, s22 +; GFX9-NEXT: v_mov_b32_e32 v34, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s24 +; GFX9-NEXT: v_mov_b32_e32 v32, s25 +; GFX9-NEXT: v_mov_b32_e32 v29, s26 +; GFX9-NEXT: v_mov_b32_e32 v30, s27 +; GFX9-NEXT: v_mov_b32_e32 v27, s28 +; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s5 +; GFX9-NEXT: v_mov_b32_e32 v41, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v51 +; GFX9-NEXT: v_mov_b32_e32 v51, v52 +; GFX9-NEXT: v_mov_b32_e32 v52, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v55 +; GFX9-NEXT: v_mov_b32_e32 v55, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s81 +; GFX9-NEXT: .LBB99_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64i16_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s10, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s12, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s15, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s74, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s75, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_4 +; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v51, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v50, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s40, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v55, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v49, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v48, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 16, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 16, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v21 +; GFX11-TRUE16-NEXT: s_branch .LBB99_5 +; GFX11-TRUE16-NEXT: .LBB99_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB99_2 +; GFX11-TRUE16-NEXT: .LBB99_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v78, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s0 +; GFX11-TRUE16-NEXT: .LBB99_5: ; %end +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v60, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v69, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v81, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v50, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v81, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v35, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v67, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v35, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v36, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, v67, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v66, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v36, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xff, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v52, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v27, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v28, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v21, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v32, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v49, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v51, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v17, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v18, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v27, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v70, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v24, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v26, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v23, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v20 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[35:38], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s5, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s4, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s9, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s8, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s11, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s10, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s74, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s75, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_4 +; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v51, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v50, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s41, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s40, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v53, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v52, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 24, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 24, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-FAKE16-NEXT: s_branch .LBB99_5 +; GFX11-FAKE16-NEXT: .LBB99_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, -1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB99_2 +; GFX11-FAKE16-NEXT: .LBB99_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v71, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v74, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v49, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v59, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v57, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v47, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v43, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v41, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v48, s62 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v54, s72 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v64, s60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v80, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v182, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v181, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v180, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v179, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v176, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v167, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v166, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v163, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v162, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v161, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v150, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v149, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v148, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v78, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-FAKE16-NEXT: .LBB99_5: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v69, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v60, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v50, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, v60, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v52, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v66, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v57, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v80, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v183 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v70, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v80, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v66, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v67, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v80, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v32, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v33, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v28, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v29, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v20, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v21, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v15, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v25, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v33, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v13, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v26, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v21, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v20, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v22, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v17 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <64 x i16> + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> br label %end cmp.false: - %a3 = bitcast <128 x i8> %a to <64 x i16> + %a3 = bitcast <64 x i16> %a to <128 x i8> br label %end end: - %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x i16> %phi + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi } define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v62 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v9 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v51 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v55 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v43 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v46 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v47 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v42, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v53 -; GCN-NEXT: v_add_f32_e32 v51, 0x40c00000, v54 -; GCN-NEXT: v_add_f32_e32 v52, 0x40c00000, v55 -; GCN-NEXT: v_add_f32_e32 v53, 0x40c00000, v40 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v41 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v42 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v43 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v44 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v45 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v46 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v47 -; GCN-NEXT: v_add_f32_e32 v45, 0x40c00000, v56 -; GCN-NEXT: v_add_f32_e32 v46, 0x40c00000, v57 -; GCN-NEXT: v_add_f32_e32 v47, 0x40c00000, v58 -; GCN-NEXT: v_add_f32_e32 v56, 0x40c00000, v59 -; GCN-NEXT: v_add_f32_e32 v57, 0x40c00000, v60 -; GCN-NEXT: v_add_f32_e32 v58, 0x40c00000, v61 -; GCN-NEXT: v_add_f32_e32 v59, 0x40c00000, v62 -; GCN-NEXT: v_add_f32_e32 v60, 0x40c00000, v63 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v63, 0x40c00000, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v31, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v29 -; GCN-NEXT: v_mov_b32_e32 v29, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v27, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v25, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v24, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v21, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v19, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v7, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v4, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v2, v1 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v8, v8, v2 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v10, v10, v2 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v12, v12, v2 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v14, v14, v2 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v16, v16, v2 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v18, v18, v2 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v20, v20, v2 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v22, v2 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v49, v50, v49 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v51, v52, v51 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v53, v54, v53 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v41, v42, v41 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64bf16_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: .LBB100_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: .LBB100_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: ; VI: ; %bb.0: @@ -106519,7 +227879,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -107105,7 +228465,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -107126,1607 +228486,5513 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v64bf16_to_v64f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v34, v34, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v35, v35, v34, s6 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; GFX9-NEXT: v_bfe_u32 v35, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v35, v35, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v36, v36, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v37, v37, v36, s6 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v37, v37, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v38, v38, v37, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v38, v38, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v39, v39, v38, s6 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v39, v39, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v48, v48, v39, s6 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v48, v48, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v49, v49, v48, s6 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v49, v49, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v50, v50, v49, s6 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v50, v50, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v51, v51, v50, s6 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v51, v51, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v52, v52, v51, s6 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v52, v52, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v53, v53, v52, s6 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v53, v53, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v54, v54, v53, s6 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v54, v54, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v55, v55, v54, s6 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v55, v55, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v40, v40, v55, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v40, v40, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 -; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v41, v41, v40, s6 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 -; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc -; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v41, v41, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 -; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v42, v42, v41, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc -; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v42, v42, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-LABEL: bitcast_v64bf16_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB100_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v34, v34, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v35, v35, v34, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v35, v35, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v36, v36, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v37, v37, v36, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v37, v37, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v38, v38, v37, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v38, v38, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v39, v39, v38, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v39, v39, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v48, v48, v39, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v48, v48, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v49, v49, v48, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v49, v49, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v50, v50, v49, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v50, v50, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v51, v51, v50, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v51, v51, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v52, v52, v51, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v52, v52, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v53, v53, v52, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v53, v53, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v54, v54, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v54, v54, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v55, v55, v54, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v55, v55, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v40, v40, v55, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v40, v40, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v41, v41, v40, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v42, v42, v41, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v42, v42, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v43, v43, v42, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v43, v43, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v44, v44, v43, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v44, v44, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v45, v45, v44, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v45, v45, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v46, v46, v45, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v46, v46, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v47, v47, v46, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v47, v47, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v56, v56, v47, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v56, v56, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v57, v57, v56, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v57, v57, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v58, v58, v57, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v58, v58, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v59, v59, v58, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v59, v59, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v60, v60, v59, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v60, v60, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v61, v61, v60, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v61, v61, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v62, v62, v61, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v62, v62, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v0, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v1, s6 +; GFX9-NEXT: v_perm_b32 v1, v16, v41, s6 +; GFX9-NEXT: v_perm_b32 v0, v17, v40, s6 +; GFX9-NEXT: v_perm_b32 v17, v32, v33, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v13, v13, v61, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v60, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v59, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v58, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v57, s6 +; GFX9-NEXT: v_perm_b32 v8, v8, v56, s6 +; GFX9-NEXT: v_perm_b32 v7, v7, v47, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v46, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v45, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v44, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v43, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v42, s6 +; GFX9-NEXT: v_perm_b32 v31, v31, v55, s6 +; GFX9-NEXT: v_perm_b32 v30, v30, v54, s6 +; GFX9-NEXT: v_perm_b32 v29, v29, v53, s6 +; GFX9-NEXT: v_perm_b32 v28, v28, v52, s6 +; GFX9-NEXT: v_perm_b32 v27, v27, v51, s6 +; GFX9-NEXT: v_perm_b32 v26, v26, v50, s6 +; GFX9-NEXT: v_perm_b32 v25, v25, v49, s6 +; GFX9-NEXT: v_perm_b32 v24, v24, v48, s6 +; GFX9-NEXT: v_perm_b32 v23, v23, v39, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v38, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v37, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v36, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v35, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 +; GFX9-NEXT: .LBB100_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB100_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v17, v17, v39 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v37, v32 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v37, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v80, 0x40c00000, v80 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v38, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_add_f32 v39, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v19, v34, v35 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v82, 0x40c00000, v82 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v49 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v38, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v48, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v49, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_lshlrev_b32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v86 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v39, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_cndmask_b32 v21, v37, v38 +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v49, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v22, v37, v38 :: v_dual_add_f32 v49, 0x40c00000, v51 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v51, 0x40c00000, v23 :: v_dual_add_f32 v14, 0x40c00000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v50, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v98, 0x40c00000, v98 +; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v48, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v14, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v98, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v98 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v38, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v49, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v52 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v98, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v54 :: v_dual_add_f32 v53, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v25, v48, v49 :: v_dual_and_b32 v26, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v53, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v54, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v50, v51, v52, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v52 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v49.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v50, v53, v54, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v64 :: v_dual_add_f32 v53, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v53, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v53 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v51, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v51, v55, v54, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v65 +; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v55, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v29 :: v_dual_lshlrev_b32 v30, 16, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66 +; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v64, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v53, v54 :: v_dual_and_b32 v31, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 +; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v55, v64, v65, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v65 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v54.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v67, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v69 :: v_dual_add_f32 v66, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v55, v64, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v68, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v68 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v67, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v66, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v66 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v70, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v69, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v70, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v64.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v65, v66, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v69, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v69 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v68, 0x40c00000, v68 :: v_dual_cndmask_b32 v65, v65, v66 +; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v70, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v70 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v68, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v65.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v67, v69, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v65, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v28.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v67, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v67, v70, v68, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v68 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v27.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v65, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v25.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v80, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v49, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v67 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v66, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v48, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v64, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 +; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v29.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v55, v52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v64, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v54, v53 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v22.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v50, v39 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v51, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v52, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v84, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v19.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v18.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v71 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-TRUE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v85, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v69 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v80 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v68, v31 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v53, v21 +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v36, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v82, v83, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 +; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v38, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v81 +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v39, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v87, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v84, v85, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v84, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v84, v86, v87, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v87 +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v96, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v85, v86, v12, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v83 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v85, v86 :: v_dual_add_f32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v85, v87, v96, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v101, v112 :: v_dual_add_f32 v87, 0x40c00000, v97 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v86, v86, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v102, v114 :: v_dual_add_f32 v15, 0x40c00000, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v96 +; GFX11-TRUE16-NEXT: v_bfe_u32 v113, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v98.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v101, v113, v15, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v84 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v101, v112, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v98, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v86, v100, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v85, v97, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v85 +; GFX11-TRUE16-NEXT: .LBB100_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB100_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v87 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v11 +; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v17, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v37, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v33, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v38, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v38, v37, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v35, v19, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v37, v21, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v48, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v39, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v39, v23, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v49, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v51, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v48, v24, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v49, v25, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v51, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v52, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v51, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v53, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v50, v26, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v51, v27, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v52, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v53, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v52, v28, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v53, v29, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v54, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v54, v64, v65, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v64, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v54, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v68, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v55, v31, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v66, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v64, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v69, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v69, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v69 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v65, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v68, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v66, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v70, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v70 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v69, v70, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v67, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v71, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v68, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v80, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-FAKE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v69, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v81, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v70, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v7, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v71, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v80, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v84, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v81, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v85, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v82, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v11, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v87, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v87 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 +; GFX11-FAKE16-NEXT: v_add3_u32 v86, v86, v87, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v84, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v97, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v87, v98, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v98, v99, v84, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v84 +; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v97, v101, v13, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v86, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v98, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v87 +; GFX11-FAKE16-NEXT: v_bfe_u32 v102, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101 +; GFX11-FAKE16-NEXT: v_add3_u32 v101, v102, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v98, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_add3_u32 v103, v103, v98, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v87, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v103, v112, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v99, v113, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v98, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v97, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v84, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB100_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v55, v50 +; SI-NEXT: v_mov_b32_e32 v40, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v24, v47 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v26, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_mov_b32_e32 v35, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v38, v10 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v19, v28 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v39, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_mov_b32_e32 v15, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_mov_b32_e32 v51, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_mov_b32_e32 v37, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v26 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v48 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v53 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v49 +; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 +; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 +; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 +; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 +; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_add_i32_e32 v16, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 +; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v44 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v40, v52 +; SI-NEXT: v_mov_b32_e32 v55, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v26, v57 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v24, v47 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v42, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 +; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 +; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 +; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 +; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 +; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 +; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 +; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 +; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: s_branch .LBB101_5 +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB101_5: ; %end +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_readlane_b32 s31, v42, 1 +; VI-NEXT: v_readlane_b32 s30, v42, 0 +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 -; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v43, v43, v42, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc -; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v43, v43, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v43, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 -; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v44, v44, v43, s6 -; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 -; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc -; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v44, v44, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 -; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v45, v45, v44, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 -; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc -; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 -; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v46, v46, v45, s6 -; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 -; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc -; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v46, v46, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 -; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v47, v47, v46, s6 -; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc -; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v47, v47, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 -; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v56, v56, v47, s6 -; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 -; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc -; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v56, v56, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v56, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 -; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v57, v57, v56, s6 -; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 -; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc -; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v57, v57, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 -; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v58, v58, v57, s6 -; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc -; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v58, v58, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 -; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v59, v59, v58, s6 -; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 -; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc -; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v59, v59, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 -; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v60, v60, v59, s6 -; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 -; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc -; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v60, v60, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 -; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v61, v61, v60, s6 -; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 -; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc -; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v61, v61, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 -; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v62, v62, v61, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 -; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc -; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v62, v62, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 -; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 -; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc -; GFX9-NEXT: v_add3_u32 v62, v62, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 -; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 -; GFX9-NEXT: v_bfe_u32 v62, v15, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc -; GFX9-NEXT: v_add3_u32 v62, v62, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc -; GFX9-NEXT: s_mov_b32 s6, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v0, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v1, s6 -; GFX9-NEXT: v_perm_b32 v1, v16, v41, s6 -; GFX9-NEXT: v_perm_b32 v0, v17, v40, s6 -; GFX9-NEXT: v_perm_b32 v17, v32, v33, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v13, v13, v61, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v60, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v59, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v58, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v57, s6 -; GFX9-NEXT: v_perm_b32 v8, v8, v56, s6 -; GFX9-NEXT: v_perm_b32 v7, v7, v47, s6 -; GFX9-NEXT: v_perm_b32 v6, v6, v46, s6 -; GFX9-NEXT: v_perm_b32 v5, v5, v45, s6 -; GFX9-NEXT: v_perm_b32 v4, v4, v44, s6 -; GFX9-NEXT: v_perm_b32 v3, v3, v43, s6 -; GFX9-NEXT: v_perm_b32 v2, v2, v42, s6 -; GFX9-NEXT: v_perm_b32 v31, v31, v55, s6 -; GFX9-NEXT: v_perm_b32 v30, v30, v54, s6 -; GFX9-NEXT: v_perm_b32 v29, v29, v53, s6 -; GFX9-NEXT: v_perm_b32 v28, v28, v52, s6 -; GFX9-NEXT: v_perm_b32 v27, v27, v51, s6 -; GFX9-NEXT: v_perm_b32 v26, v26, v50, s6 -; GFX9-NEXT: v_perm_b32 v25, v25, v49, s6 -; GFX9-NEXT: v_perm_b32 v24, v24, v48, s6 -; GFX9-NEXT: v_perm_b32 v23, v23, v39, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v38, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v37, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v36, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v35, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 -; GFX9-NEXT: .LBB49_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v33, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v33 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v31, v40, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v28, v53, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v27, v52, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v24, v49, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v39, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v36, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v32, v35, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: s_branch .LBB101_5 +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB101_5: ; %end +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v17, v17, v39 :: v_dual_lshlrev_b32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v37, v32 :: v_dual_lshlrev_b32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v37, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v80, 0x40c00000, v80 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v38, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_add_f32 v39, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v19, v34, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v82, 0x40c00000, v82 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_lshlrev_b32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v49 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v38, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v48, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v48, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v86 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v39, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v39 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_cndmask_b32 v21, v37, v38 -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v49, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v22, v37, v38 :: v_dual_add_f32 v49, 0x40c00000, v51 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v50 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v51, 0x40c00000, v23 :: v_dual_add_f32 v14, 0x40c00000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v50, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v98, 0x40c00000, v98 -; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v48, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v48 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v14, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v98, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v98 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v38, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v49, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v52 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v98, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v50, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v39, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53 -; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v54 :: v_dual_add_f32 v53, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v25, v48, v49 :: v_dual_and_b32 v26, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v53, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-TRUE16-NEXT: .LBB101_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v54, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v50, v51, v52, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v52 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v49.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v50, v53, v54, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v64 :: v_dual_add_f32 v53, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v53, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v53 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v51, v52, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v51, v55, v54, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v65 -; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v55, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v29 :: v_dual_lshlrev_b32 v30, 16, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66 -; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v65, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v64, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v30 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v53, v54 :: v_dual_and_b32 v31, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 -; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v66, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v55, v64, v65, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v65 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v54.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v67, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v69 :: v_dual_add_f32 v66, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v55, v64, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v68, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v68 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v67, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v66, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v66 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v70, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v69, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v70, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v11, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v64.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v65, v66, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v69, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v69 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v68, 0x40c00000, v68 :: v_dual_cndmask_b32 v65, v65, v66 -; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v70, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v70 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v68, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v65.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v67, v69, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v65, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v28.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v67, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v67, v70, v68, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v68 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v66 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v27.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v65, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v25.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 -; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v80, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v49, v48 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v67 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v66, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v48, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 -; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v35, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v68 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v64, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v30.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 -; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v29.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v55, v52 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v64, v51 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v54, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v22.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v50, v39 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v51, v38 -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v52, v37 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 -; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 -; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v84, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v19.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v18.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v71 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 -; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v85, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v69 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v80 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v68, v31 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v53, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v36, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v82, v83, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 -; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 -; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v38, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v81 -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v39, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v87, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v84, v85, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v82 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v84, v85, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v84, v86, v87, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v87 -; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v96, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v85, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v85, v86, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v83 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v85, v86 :: v_dual_add_f32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v85, v87, v96, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v101, v112 :: v_dual_add_f32 v87, 0x40c00000, v97 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v86, v86, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v102, v114 :: v_dual_add_f32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v96 -; GFX11-TRUE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v98.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v101, v113, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v84 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v101, v112, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v98, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v86, v100, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v85, v97, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v85 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v87 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v11 -; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v17, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v37, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v33, v17, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v38, v37, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v35, v19, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v37, v21, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v48, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v39, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v39, v23, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v48, v24, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v49, v25, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v51, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v52, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v51, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v53, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v50, v26, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v64, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v51, v27, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v52, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v53, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v65, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v52, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v53, v29, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v54, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v54, v64, v65, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v65 -; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v55, v64, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v54, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v68, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v11, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v82, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v12, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v68, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v83, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v83 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v82, v83 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v82, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v86, v86, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v83, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v86, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v86 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v97, v82, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v96, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v97, v97, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v99, v99, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v85, v85, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97 +; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v98, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v101, v101, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v65, 16, v67 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v80 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v86 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff, v96 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v68 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v83, 16, v82 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v85, 16, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v87, 16, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v9, 16, v86 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v97, 16, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v81, 16, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v84, 16, v85 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v69, 16, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v64, 16, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v70, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v55, 16, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v50, 16, v65 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v52, 16, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v71, 16, v80 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v70 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v66 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v34, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v17, 16, v38 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB101_3: +; GFX11-TRUE16-NEXT: s_branch .LBB101_2 +; GFX11-TRUE16-NEXT: .LBB101_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-FAKE16-NEXT: .LBB101_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v68 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v55, v31, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v65, v66, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v64, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v69, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v66, v67, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v69, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v69 -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v65, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v68, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v66, 0x7060302 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v11, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v70, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v70 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v69, v70, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v8, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v71, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v68, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v80, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 -; GFX11-FAKE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v69, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v81, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v70, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v82, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX11-FAKE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v7, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v71, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v80, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v84, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 -; GFX11-FAKE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v81, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v85, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v82, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v11, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13 -; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v87, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v87 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 -; GFX11-FAKE16-NEXT: v_add3_u32 v86, v86, v87, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v84, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v97, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add3_u32 v87, v98, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v98, v99, v84, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v84 -; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v97, v101, v13, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v86, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v87, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v98, v100, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v87 -; GFX11-FAKE16-NEXT: v_bfe_u32 v102, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101 -; GFX11-FAKE16-NEXT: v_add3_u32 v101, v102, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v98, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v98 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_add3_u32 v103, v103, v98, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v87, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v103, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v99, v113, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v98, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v97, v100, vcc_lo +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v82, v14 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v84, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v12, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v68, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v83, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v83 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v82, v83 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, v86, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v83, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v86, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v86 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v97, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v96, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, v97, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v103, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, v99, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, v85, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97 +; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v98, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, v101, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v65, 16, v67 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v86 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v1, 16, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v83, 16, v82 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v85, 16, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v87, 16, v83 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v9, 16, v86 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v97, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v81, 16, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v84, 16, v85 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v69, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v64, 16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v70, 16, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v55, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v50, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v52, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v38, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v71, 16, v80 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v83 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v17, 16, v38 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB101_3: +; GFX11-FAKE16-NEXT: s_branch .LBB101_2 +; GFX11-FAKE16-NEXT: .LBB101_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -108746,1280 +234012,1299 @@ end: } define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB50_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v63 -; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: .LBB50_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v47, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v2, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v3, v1, v3, 16 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v5, v1, v5, 16 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v7, v1, v7, 16 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v9, v1, v9, 16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v11, v1, v11, 16 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v13, v1, v13, 16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v15, v1, v15, 16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v17, v1, v17, 16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v19, v1, v19, 16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v21, v1, v21, 16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v23, v1, v23, 16 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v25, v1, v25, 16 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v27, v1, v27, 16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v29, v1, v29, 16 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v31, v1, v31, 16 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v33, v1, v33, 16 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v35, v1, v35, 16 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v37, v1, v37, 16 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v39, v1, v39, 16 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v49, v1, v49, 16 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v51, v1, v51, 16 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v53, v1, v53, 16 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v55, v1, v55, 16 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v41, v1, v41, 16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB102_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v60 +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v61 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v57 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v41 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v3, v24 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: .LBB102_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64bf16: ; VI: ; %bb.0: @@ -110031,7 +235316,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 @@ -110131,7 +235416,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v33, v17 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -110146,7 +235431,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -110182,7 +235467,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB50_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -110199,7 +235484,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -110234,7 +235519,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB50_2: ; %end +; GFX11-NEXT: .LBB102_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -110255,1184 +235540,2661 @@ end: ret <64 x bfloat> %phi } +define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v46, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v38 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v48 +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB103_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_mov_b32_e32 v37, v45 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_mov_b32_e32 v49, v47 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v58, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44 +; SI-NEXT: v_mov_b32_e32 v44, v18 +; SI-NEXT: v_mov_b32_e32 v5, v43 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: s_branch .LBB103_3 +; SI-NEXT: .LBB103_2: +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v49, v47 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_mov_b32_e32 v37, v45 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB103_3: ; %Flow +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v9 +; SI-NEXT: v_mov_b32_e32 v12, v31 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v9, v17 +; SI-NEXT: s_cbranch_vccnz .LBB103_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: .LBB103_5: ; %end +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB103_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_3 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v33, v15 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v33, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v33, v13 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v33, v12 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v33, v11 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v33, v10 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v33, v9 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v33, v8 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v33, v6 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v33, v5 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v33, v4 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v33, v3 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v33, v2 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v33, v1 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v33, v0 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v31, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v33, v31 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v30 +; VI-NEXT: v_add_f16_sdwa v30, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v33, v30 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v29 +; VI-NEXT: v_add_f16_sdwa v29, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v33, v29 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v28 +; VI-NEXT: v_add_f16_sdwa v28, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v33, v28 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v27 +; VI-NEXT: v_add_f16_sdwa v27, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v33, v27 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v26 +; VI-NEXT: v_add_f16_sdwa v26, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v25 +; VI-NEXT: v_add_f16_sdwa v25, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v33, v25 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v24 +; VI-NEXT: v_add_f16_sdwa v24, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v33, v24 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v23 +; VI-NEXT: v_add_f16_sdwa v23, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v33, v23 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v22 +; VI-NEXT: v_add_f16_sdwa v22, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v33, v22 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v21 +; VI-NEXT: v_add_f16_sdwa v21, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v33, v21 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v20 +; VI-NEXT: v_add_f16_sdwa v20, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v33, v20 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v19 +; VI-NEXT: v_add_f16_sdwa v19, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v33, v19 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v32 +; VI-NEXT: v_add_f16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v32, v33, v32 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v33, v17 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB103_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_4: +; VI-NEXT: s_branch .LBB103_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_3 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB103_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: s_branch .LBB103_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB103_3: +; GFX11-NEXT: s_branch .LBB103_2 +; GFX11-NEXT: .LBB103_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v27 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v28 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v55 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v38 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v58 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v56 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v46 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v43 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v39 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v26 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v11 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v36 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v35 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v34 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v31 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: .LBB51_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v35 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v35 -; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v33 -; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GCN-NEXT: v_alignbit_b32 v10, v26, v10, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GCN-NEXT: v_alignbit_b32 v4, v8, v4, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_alignbit_b32 v8, v8, v10, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v24 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GCN-NEXT: v_alignbit_b32 v10, v10, v24, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GCN-NEXT: v_alignbit_b32 v18, v21, v18, 16 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v43 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v56, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v57, 0x40c00000, v53 -; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 -; GCN-NEXT: v_add_f32_e32 v59, 0x40c00000, v55 -; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v28 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v30 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v37 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v48 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v49 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v50 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v51, v12, 16 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v53, v7, 16 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v50, v54, v16, 16 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_alignbit_b32 v51, v21, v18, 16 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_alignbit_b32 v53, v23, v22, 16 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_alignbit_b32 v54, v25, v24, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v55, v26, v41, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v40 -; GCN-NEXT: v_alignbit_b32 v40, v26, v42, 16 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v42, v44, v43, 16 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v44, v45, v32, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v45, v27, v34, 16 -; GCN-NEXT: v_alignbit_b32 v46, v28, v36, 16 -; GCN-NEXT: v_alignbit_b32 v47, v29, v38, 16 -; GCN-NEXT: v_alignbit_b32 v56, v30, v56, 16 -; GCN-NEXT: v_alignbit_b32 v58, v48, v57, 16 -; GCN-NEXT: v_alignbit_b32 v62, v52, v59, 16 -; GCN-NEXT: v_alignbit_b32 v7, v62, v20, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v58, v19, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v56, v17, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v47, v9, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v6, v46, v6, 16 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v3, v45, v3, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v1, v44, v1, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v42, v2, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v40, v4, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v8, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v10, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v14, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v13, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v5, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v11, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: .LBB51_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: v_or_b32_e32 v56, v1, v2 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v42 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v55 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v54 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v30, v30, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v32, v32, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v53 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v34, v34, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v36, v36, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v51 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v38, v38, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v51, v51, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v50, v50, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v54, v54, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v49, v49, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v41, v41, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v39, v39, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64bf16_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v59 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v45 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v40 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v55 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v54 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB104_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v33 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_alignbit_b32 v15, v26, v15, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_alignbit_b32 v15, v18, v15, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_alignbit_b32 v15, v17, v15, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_alignbit_b32 v10, v15, v10, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_alignbit_b32 v3, v8, v3, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v3, v10, v3, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v2, v4, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v19, v10, v4, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v3, v15, v3, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v10, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v3, v16, v3, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v3, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v5, v20, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v7, v23, v7, 16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_alignbit_b32 v24, v45, v8, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v24, v8, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_alignbit_b32 v26, v34, v9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v26, v9, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v52, v53, v14, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v14, v52, v14, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v21, v6, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v19, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v18, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: .LBB104_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64i16: ; VI: ; %bb.0: @@ -111460,7 +238222,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -112046,7 +238808,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB104_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -112093,7 +238855,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -112583,7 +239345,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB104_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -112616,7 +239378,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 @@ -113182,7 +239944,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v37, 16, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v16 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB104_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -113199,7 +239961,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 @@ -113709,7 +240471,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB104_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -113730,865 +240492,3837 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB105_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v42, v62 +; SI-NEXT: v_mov_b32_e32 v43, v63 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v60 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v51, v61 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v52, v10 +; SI-NEXT: v_mov_b32_e32 v53, v59 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_mov_b32_e32 v62, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v41, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: s_branch .LBB105_3 +; SI-NEXT: .LBB105_2: +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_mov_b32_e32 v51, v61 +; SI-NEXT: v_mov_b32_e32 v42, v62 +; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_mov_b32_e32 v25, v60 +; SI-NEXT: v_mov_b32_e32 v24, v56 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v43, v63 +; SI-NEXT: v_mov_b32_e32 v52, v10 +; SI-NEXT: v_mov_b32_e32 v53, v59 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: .LBB105_3: ; %Flow +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB105_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16 +; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15 +; SI-NEXT: v_mov_b32_e32 v15, v24 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16 +; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 +; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16 +; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16 +; SI-NEXT: v_mov_b32_e32 v14, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v42, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 +; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 +; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 +; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 +; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 +; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 +; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 +; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 +; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_readlane_b32 s31, v42, 1 +; VI-NEXT: v_readlane_b32 s30, v42, 0 +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff0000 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_and_or_b32 v14, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_and_or_b32 v15, v3, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v13, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v12, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v11, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v10, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v9, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v8, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v7, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v6, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v5, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v4, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v3, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v2, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v33 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v33, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_and_or_b32 v31, v40, v18, v31 +; GFX9-NEXT: v_and_or_b32 v30, v55, v18, v30 +; GFX9-NEXT: v_and_or_b32 v29, v54, v18, v29 +; GFX9-NEXT: v_and_or_b32 v28, v53, v18, v28 +; GFX9-NEXT: v_and_or_b32 v27, v52, v18, v27 +; GFX9-NEXT: v_and_or_b32 v26, v51, v18, v26 +; GFX9-NEXT: v_and_or_b32 v25, v50, v18, v25 +; GFX9-NEXT: v_and_or_b32 v24, v49, v18, v24 +; GFX9-NEXT: v_and_or_b32 v23, v48, v18, v23 +; GFX9-NEXT: v_and_or_b32 v22, v39, v18, v22 +; GFX9-NEXT: v_and_or_b32 v21, v38, v18, v21 +; GFX9-NEXT: v_and_or_b32 v20, v37, v18, v20 +; GFX9-NEXT: v_and_or_b32 v19, v36, v18, v19 +; GFX9-NEXT: v_and_or_b32 v32, v35, v18, v32 +; GFX9-NEXT: v_and_or_b32 v17, v34, v18, v17 +; GFX9-NEXT: v_and_or_b32 v16, v33, v18, v16 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-NEXT: .LBB105_2: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v0 +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v11, v3 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v6, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v7, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v6 :: v_dual_add_nc_u32 v2, v2, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v18, v1, v5 :: v_dual_lshlrev_b32 v5, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v2, v6 :: v_dual_add_nc_u32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v35, v0, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v37, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v23, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v24, v1, v3 +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v7 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v39, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v25, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v0, v1, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v1, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; GFX11-NEXT: v_dual_cndmask_b32 v26, v0, v6 :: v_dual_add_nc_u32 v1, v1, v4 +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v27, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v28, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v51, v0, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v29, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v52, v0, v1 :: v_dual_add_nc_u32 v1, 0x7fff, v4 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v30, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v53, v0, v5 :: v_dual_add_f32 v0, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v54, v1, v2 :: v_dual_add_nc_u32 v1, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v1, v2, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v1 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v55, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v6 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v64, v5, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v10 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v65, v6, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65 +; GFX11-NEXT: v_cndmask_b32_e32 v66, v4, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v67, v6, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_cndmask_b32_e32 v68, v5, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v10 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v5 +; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v69, v6, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6 +; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v70, v7, v8, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v12 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v69, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v11, v7 +; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v80, v8, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v13 +; GFX11-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v15, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-NEXT: v_bfe_u32 v71, v12, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v10, v14, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v81, v9, v15, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v14 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v14 +; GFX11-NEXT: v_bfe_u32 v83, v13, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v71, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v83, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v82, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v83, v15, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v84, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v71, v12, v71, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v83, v15 +; GFX11-NEXT: v_bfe_u32 v13, v84, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v14, v82, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v84 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_bfe_u32 v85, v14, 16, 1 +; GFX11-NEXT: v_bfe_u32 v86, v82, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v82 +; GFX11-NEXT: v_dual_cndmask_b32 v83, v12, v83 :: v_dual_add_nc_u32 v12, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v84 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v85, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-NEXT: v_add_nc_u32_e32 v85, v86, v82 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v13 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v85, 0x7fff, v85 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v70, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v84, v15, v84, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-NEXT: v_add_f32_e64 v87, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v86, v13, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v82, v85, v96, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v85, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-NEXT: v_bfe_u32 v15, v87, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v96, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_bfe_u32 v97, v85, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v98, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v99, v96, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v100, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v96 +; GFX11-NEXT: v_bfe_u32 v101, v98, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-NEXT: v_add_nc_u32_e32 v99, v99, v96 +; GFX11-NEXT: v_add_nc_u32_e32 v97, v97, v85 +; GFX11-NEXT: v_bfe_u32 v103, v100, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v101, v101, v98 +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v98 +; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99 +; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v85 +; GFX11-NEXT: v_add_nc_u32_e32 v101, 0x7fff, v101 +; GFX11-NEXT: v_add_nc_u32_e32 v103, v103, v100 +; GFX11-NEXT: v_cndmask_b32_e32 v96, v99, v113, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v87 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v86, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v103 +; GFX11-NEXT: v_or_b32_e32 v103, 0x400000, v100 +; GFX11-NEXT: v_cndmask_b32_e32 v98, v101, v114, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v87 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v85, v97, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v96 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v64, v65 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v55, v69 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v97, v99, v103, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v28 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v68, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v87, v15, v102, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v66, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v27 +; GFX11-NEXT: v_and_or_b32 v29, 0xffff0000, v52, v55 +; GFX11-NEXT: v_and_or_b32 v28, 0xffff0000, v51, v64 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v14, v86, vcc_lo +; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v85, v96 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v11 +; GFX11-NEXT: v_and_or_b32 v27, 0xffff0000, v50, v65 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v98 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v10 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v71, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v81 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_or_b32 v30, 0xffff0000, v53, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GFX11-NEXT: v_and_or_b32 v25, 0xffff0000, v48, v49 +; GFX11-NEXT: v_and_or_b32 v24, 0xffff0000, v39, v50 +; GFX11-NEXT: v_and_or_b32 v23, 0xffff0000, v38, v51 +; GFX11-NEXT: v_and_or_b32 v22, 0xffff0000, v37, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v97, v98 +; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v85 +; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v84, v82 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v83, v86 +; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v9, v96 +; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v8, v71 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v80, v7 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v68 +; GFX11-NEXT: v_and_or_b32 v31, 0xffff0000, v31, v70 +; GFX11-NEXT: v_and_or_b32 v26, 0xffff0000, v26, v66 +; GFX11-NEXT: v_and_or_b32 v21, 0xffff0000, v21, v53 +; GFX11-NEXT: v_and_or_b32 v20, 0xffff0000, v35, v36 +; GFX11-NEXT: v_and_or_b32 v19, 0xffff0000, v34, v37 +; GFX11-NEXT: v_and_or_b32 v18, 0xffff0000, v33, v38 +; GFX11-NEXT: v_and_or_b32 v17, 0xffff0000, v32, v39 +; GFX11-NEXT: v_and_or_b32 v16, 0xffff0000, v16, v48 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB105_3: +; GFX11-NEXT: s_branch .LBB105_2 +; GFX11-NEXT: .LBB105_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v38 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v54 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v35 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v18 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: .LBB52_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v55, v10 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v54, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v53, v12 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v52, v16 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: v_or_b32_e32 v20, v50, v20 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v26, v38, v26 -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v31 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v32, v33 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v33, v29 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v33, v27 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v33, v25 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v33, v21 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v33, v19 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v33, v15 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v33, v13 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v33, v11 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v33, v9 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v33, v7 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v33, v3 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v33, v1 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v31 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v32 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: .LBB52_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v44, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v15, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v11, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v7, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v45, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v46, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v47, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v56, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v57, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v58, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v34, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v33, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v32, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v31, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v41, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x4c, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v30 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x50, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x54, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v39 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v10, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x58, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v48 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v49 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v50 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v50, 16 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_alignbit_b32 v16, v51, v16, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v22, v52, v22, 16 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x6c, v0 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_alignbit_b32 v24, v53, v24, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x70, v0 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v26, v54, v26, 16 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_alignbit_b32 v28, v55, v28, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v46, vcc, 0x30000, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v18, v11 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v53, v7 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v52, v9 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v22, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64bf16: ; VI: ; %bb.0: @@ -114600,7 +244334,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB106_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 3 ; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -114700,7 +244434,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB52_2: ; %end +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -114715,7 +244449,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -114750,7 +244484,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB52_2: ; %end +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -114767,7 +244501,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -114802,7 +244536,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB52_2: ; %end +; GFX11-NEXT: .LBB106_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -114823,717 +244557,1996 @@ end: ret <64 x bfloat> %phi } +define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s60, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s17, 0 +; SI-NEXT: s_mov_b32 s61, s19 +; SI-NEXT: v_writelane_b32 v41, s60, 1 +; SI-NEXT: s_mov_b32 s63, s18 +; SI-NEXT: v_writelane_b32 v41, s61, 2 +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s72, 4 +; SI-NEXT: s_mov_b32 s74, s23 +; SI-NEXT: v_writelane_b32 v41, s20, 5 +; SI-NEXT: v_writelane_b32 v41, s74, 6 +; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: v_writelane_b32 v41, s22, 7 +; SI-NEXT: v_writelane_b32 v41, s75, 8 +; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s24, 9 +; SI-NEXT: v_writelane_b32 v41, s76, 10 +; SI-NEXT: s_mov_b32 s93, s29 +; SI-NEXT: v_writelane_b32 v41, s26, 11 +; SI-NEXT: v_writelane_b32 v41, s93, 12 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_writelane_b32 v41, s28, 13 +; SI-NEXT: v_readfirstlane_b32 s73, v4 +; SI-NEXT: v_writelane_b32 v41, s16, 14 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_writelane_b32 v41, s73, 15 +; SI-NEXT: v_readfirstlane_b32 s90, v6 +; SI-NEXT: v_writelane_b32 v41, s89, 16 +; SI-NEXT: v_readfirstlane_b32 s91, v5 +; SI-NEXT: v_writelane_b32 v41, s90, 17 +; SI-NEXT: v_readfirstlane_b32 s34, v8 +; SI-NEXT: v_writelane_b32 v41, s91, 18 +; SI-NEXT: v_readfirstlane_b32 s35, v7 +; SI-NEXT: v_writelane_b32 v41, s34, 19 +; SI-NEXT: v_readfirstlane_b32 s36, v10 +; SI-NEXT: v_writelane_b32 v41, s35, 20 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s37, v9 +; SI-NEXT: v_writelane_b32 v41, s36, 21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s80, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s69, v33 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s84, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s83, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s87, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s38, v12 +; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s14, v30 +; SI-NEXT: v_readfirstlane_b32 s15, v29 +; SI-NEXT: v_readfirstlane_b32 s12, v28 +; SI-NEXT: v_readfirstlane_b32 s13, v27 +; SI-NEXT: v_readfirstlane_b32 s10, v26 +; SI-NEXT: v_readfirstlane_b32 s11, v25 +; SI-NEXT: v_readfirstlane_b32 s8, v24 +; SI-NEXT: v_readfirstlane_b32 s9, v23 +; SI-NEXT: v_readfirstlane_b32 s88, v22 +; SI-NEXT: v_readfirstlane_b32 s29, v21 +; SI-NEXT: v_readfirstlane_b32 s79, v20 +; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_readfirstlane_b32 s78, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_readfirstlane_b32 s77, v16 +; SI-NEXT: v_readfirstlane_b32 s23, v15 +; SI-NEXT: v_readfirstlane_b32 s39, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v13 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s18, v1 +; SI-NEXT: v_writelane_b32 v41, s38, 23 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_writelane_b32 v41, s39, 24 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s59, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s56, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s57, v39 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s47, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s44, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s42, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s43, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s40, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s41, v37 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB107_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s4, s60, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 25 +; SI-NEXT: s_lshl_b32 s4, s63, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 26 +; SI-NEXT: s_lshl_b32 s4, s20, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 27 +; SI-NEXT: s_lshl_b32 s4, s22, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 28 +; SI-NEXT: s_lshl_b32 s4, s24, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 29 +; SI-NEXT: s_lshl_b32 s4, s26, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 30 +; SI-NEXT: s_lshl_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 31 +; SI-NEXT: s_lshl_b32 s4, s18, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 32 +; SI-NEXT: s_lshl_b32 s4, s89, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: s_lshl_b32 s4, s91, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 34 +; SI-NEXT: s_lshl_b32 s4, s35, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s37, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s96, s61, 16 +; SI-NEXT: s_lshl_b32 s99, s72, 16 +; SI-NEXT: s_lshl_b32 s97, s74, 16 +; SI-NEXT: s_lshl_b32 s92, s75, 16 +; SI-NEXT: s_lshl_b32 s94, s76, 16 +; SI-NEXT: s_lshl_b32 s95, s93, 16 +; SI-NEXT: s_lshl_b32 s93, s16, 16 +; SI-NEXT: s_lshl_b32 s30, s73, 16 +; SI-NEXT: s_lshl_b32 s31, s90, 16 +; SI-NEXT: s_lshl_b32 s34, s34, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 36 +; SI-NEXT: s_lshl_b32 s35, s36, 16 +; SI-NEXT: s_lshl_b32 s86, s19, 16 +; SI-NEXT: s_lshl_b32 s36, s38, 16 +; SI-NEXT: s_lshl_b32 s22, s21, 16 +; SI-NEXT: s_lshl_b32 s37, s39, 16 +; SI-NEXT: s_lshl_b32 s24, s23, 16 +; SI-NEXT: s_lshl_b32 s38, s77, 16 +; SI-NEXT: s_lshl_b32 s28, s25, 16 +; SI-NEXT: s_lshl_b32 s39, s78, 16 +; SI-NEXT: s_lshl_b32 s61, s27, 16 +; SI-NEXT: s_lshl_b32 s48, s79, 16 +; SI-NEXT: s_lshl_b32 s89, s29, 16 +; SI-NEXT: s_lshl_b32 s49, s88, 16 +; SI-NEXT: s_lshl_b32 s60, s9, 16 +; SI-NEXT: s_lshl_b32 s50, s8, 16 +; SI-NEXT: s_lshl_b32 s90, s11, 16 +; SI-NEXT: s_lshl_b32 s91, s10, 16 +; SI-NEXT: s_lshl_b32 s70, s13, 16 +; SI-NEXT: s_lshl_b32 s51, s12, 16 +; SI-NEXT: s_lshl_b32 s71, s15, 16 +; SI-NEXT: s_lshl_b32 s52, s14, 16 +; SI-NEXT: s_lshl_b32 s20, s41, 16 +; SI-NEXT: s_lshl_b32 s53, s40, 16 +; SI-NEXT: s_lshl_b32 s81, s43, 16 +; SI-NEXT: s_lshl_b32 s54, s42, 16 +; SI-NEXT: s_lshl_b32 s63, s45, 16 +; SI-NEXT: s_lshl_b32 s55, s44, 16 +; SI-NEXT: s_lshl_b32 s72, s47, 16 +; SI-NEXT: s_lshl_b32 s64, s46, 16 +; SI-NEXT: s_lshl_b32 s82, s57, 16 +; SI-NEXT: s_lshl_b32 s65, s56, 16 +; SI-NEXT: s_lshl_b32 s74, s59, 16 +; SI-NEXT: s_lshl_b32 s66, s58, 16 +; SI-NEXT: s_lshl_b32 s75, s87, 16 +; SI-NEXT: s_mov_b32 s73, s6 +; SI-NEXT: s_lshl_b32 s67, s6, 16 +; SI-NEXT: s_lshl_b32 s76, s83, 16 +; SI-NEXT: s_mov_b32 s16, s68 +; SI-NEXT: s_lshl_b32 s68, s68, 16 +; SI-NEXT: s_lshl_b32 s85, s84, 16 +; SI-NEXT: s_mov_b32 s98, s69 +; SI-NEXT: s_lshl_b32 s69, s69, 16 +; SI-NEXT: s_lshl_b32 s17, s80, 16 +; SI-NEXT: s_mov_b32 s6, s62 +; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB107_3 +; SI-NEXT: .LBB107_2: +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s16, s68 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s73, s6 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s6, s62 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s98, s69 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: .LBB107_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s5, s17 +; SI-NEXT: s_mov_b32 s17, s86 +; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_cbranch_vccnz .LBB107_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 24 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 23 +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_lshl_b32 s61, s16, 16 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s16, v41, 19 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_readlane_b32 s16, v41, 18 +; SI-NEXT: s_lshl_b32 s60, s98, 16 +; SI-NEXT: s_or_b32 s17, s17, s19 +; SI-NEXT: s_add_i32 s98, s16, 3 +; SI-NEXT: v_readlane_b32 s19, v41, 17 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s16, s98, 0xffff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s19, s16 +; SI-NEXT: v_readlane_b32 s19, v41, 16 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_add_i32 s96, s19, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 15 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s29, 0xffff +; SI-NEXT: s_lshl_b32 s11, s88, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_and_b32 s19, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s78, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_or_b32 s18, s21, s18 +; SI-NEXT: v_readlane_b32 s21, v41, 13 +; SI-NEXT: s_or_b32 s15, s22, s15 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_readlane_b32 s22, v41, 12 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_readlane_b32 s22, v41, 11 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_readlane_b32 s23, v41, 9 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_readlane_b32 s24, v41, 8 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: v_readlane_b32 s24, v41, 7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_readlane_b32 s25, v41, 6 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_readlane_b32 s25, v41, 5 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_readlane_b32 s26, v41, 4 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: v_readlane_b32 s26, v41, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_readlane_b32 s27, v41, 2 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: v_readlane_b32 s27, v41, 1 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_readlane_b32 s28, v41, 0 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_or_b32 s27, s28, s27 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 25 +; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 26 +; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s25, 27 +; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s24, 28 +; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s23, 29 +; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s84, 0xffff +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s22, 30 +; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s5, s60, s5 +; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s21, 31 +; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: s_and_b32 s60, s87, 0xffff +; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_and_b32 s59, s59, 0xffff +; SI-NEXT: s_lshl_b32 s58, s58, 16 +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s56, 16 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s18, 32 +; SI-NEXT: s_lshl_b32 s18, s19, 16 +; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_or_b32 s42, s42, s43 +; SI-NEXT: s_or_b32 s40, s40, s41 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s18, 33 +; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s76, s76, 0x30000 +; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 s58, s58, 0x30000 +; SI-NEXT: s_add_i32 s56, s56, 0x30000 +; SI-NEXT: s_add_i32 s46, s46, 0x30000 +; SI-NEXT: s_add_i32 s44, s44, 0x30000 +; SI-NEXT: s_add_i32 s42, s42, 0x30000 +; SI-NEXT: s_add_i32 s40, s40, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 34 +; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 +; SI-NEXT: v_writelane_b32 v41, s6, 35 +; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s20, 16 +; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s15, 16 +; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s13, 16 +; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s9, 16 +; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s8, 16 +; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s10, 16 +; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s12, 16 +; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s14, 16 +; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s40, 16 +; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s42, 16 +; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s44, 16 +; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s46, 16 +; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s82, s56, 16 +; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s58, 16 +; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s75, 16 +; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s76, 16 +; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s5, 16 +; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: .LBB107_5: ; %end +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_readlane_b32 s4, v41, 25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_readlane_b32 s4, v41, 26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_readlane_b32 s4, v41, 27 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_readlane_b32 s4, v41, 28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_readlane_b32 s4, v41, 29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_readlane_b32 s4, v41, 30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_readlane_b32 s4, v41, 31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_readlane_b32 s4, v41, 32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_readlane_b32 s4, v41, 33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_readlane_b32 s4, v41, 34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_readlane_b32 s4, v41, 35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_readlane_b32 s4, v41, 36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s39, s7 +; VI-NEXT: s_or_b32 s6, s38, s6 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB107_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB107_4 +; GFX11-NEXT: .LBB107_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB107_3: +; GFX11-NEXT: s_branch .LBB107_2 +; GFX11-NEXT: .LBB107_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v53 -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v40 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v29 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v56 -; GCN-NEXT: v_mov_b32_e32 v49, v57 -; GCN-NEXT: v_mov_b32_e32 v54, v58 -; GCN-NEXT: v_mov_b32_e32 v51, v62 -; GCN-NEXT: v_mov_b32_e32 v48, v4 -; GCN-NEXT: v_mov_b32_e32 v36, v5 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v12 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_or_b32_e32 v4, v5, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v8 -; GCN-NEXT: v_or_b32_e32 v5, v7, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v46 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; GCN-NEXT: v_or_b32_e32 v6, v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v32 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v33 -; GCN-NEXT: v_or_b32_e32 v32, v29, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_or_b32_e32 v34, v29, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; GCN-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v30, v30, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v61, v38, v36 -; GCN-NEXT: v_or_b32_e32 v49, v49, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v52, v51 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v42, v42, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v46, v44 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v58, v56 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v63, v2 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v57, v28 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v45, v45, v62 -; GCN-NEXT: v_or_b32_e32 v41, v41, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v53, v59 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v50, v50, v47 -; GCN-NEXT: v_or_b32_e32 v39, v39, v43 -; GCN-NEXT: v_or_b32_e32 v37, v37, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: v_alignbit_b32 v63, v34, v29, 16 -; GCN-NEXT: v_alignbit_b32 v36, v32, v36, 16 -; GCN-NEXT: v_alignbit_b32 v48, v46, v48, 16 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_alignbit_b32 v51, v5, v51, 16 -; GCN-NEXT: v_mov_b32_e32 v7, v4 -; GCN-NEXT: v_alignbit_b32 v54, v4, v54, 16 -; GCN-NEXT: v_alignbit_b32 v29, v3, v44, 16 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v29, v1, v56, 16 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v2, v26, v2, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v28, v24, v28, 16 -; GCN-NEXT: v_alignbit_b32 v53, v22, v62, 16 -; GCN-NEXT: v_alignbit_b32 v60, v20, v60, 16 -; GCN-NEXT: v_alignbit_b32 v59, v18, v59, 16 -; GCN-NEXT: v_alignbit_b32 v47, v16, v47, 16 -; GCN-NEXT: v_alignbit_b32 v43, v14, v43, 16 -; GCN-NEXT: v_alignbit_b32 v40, v11, v40, 16 -; GCN-NEXT: v_alignbit_b32 v55, v9, v55, 16 -; GCN-NEXT: .LBB53_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v30, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v2, v29 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: v_or_b32_e32 v56, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: v_or_b32_e32 v61, v1, v2 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v7, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v33, v1, v2 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v5, v1, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v35, v1, v2 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v1, v1, v30 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; GCN-NEXT: v_or_b32_e32 v49, v30, v49 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v60 -; GCN-NEXT: v_or_b32_e32 v45, v30, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v59 -; GCN-NEXT: v_or_b32_e32 v53, v30, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v47 -; GCN-NEXT: v_or_b32_e32 v50, v30, v50 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v43 -; GCN-NEXT: v_or_b32_e32 v39, v30, v39 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v40 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v37, v37, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v9, v9, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v29, v27 +; SI-NEXT: v_mov_b32_e32 v57, v23 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_mov_b32_e32 v62, v3 +; SI-NEXT: v_mov_b32_e32 v63, v4 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v33, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v61 +; SI-NEXT: v_or_b32_e32 v58, v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_or_b32_e32 v18, v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v39, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_or_b32_e32 v52, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_or_b32_e32 v55, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_or_b32_e32 v43, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v38, v38, v47 +; SI-NEXT: v_or_b32_e32 v54, v54, v42 +; SI-NEXT: v_or_b32_e32 v49, v49, v51 +; SI-NEXT: v_or_b32_e32 v45, v45, v50 +; SI-NEXT: v_or_b32_e32 v41, v41, v30 +; SI-NEXT: v_or_b32_e32 v46, v46, v32 +; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 +; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 +; SI-NEXT: v_alignbit_b32 v51, v58, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v36, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v63 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v63, v37, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v39, v9, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v29, v29, v23 +; SI-NEXT: v_or_b32_e32 v2, v36, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_mov_b32_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v62, v56, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v60 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v60, v56, v37 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v57, v56, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 +; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v59, v56, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_alignbit_b32 v2, v18, v36, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v19, v37, 16 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v24, v23, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v56, v56, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v21, v27, 16 +; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v34, v36, v34 +; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v44 +; SI-NEXT: v_or_b32_e32 v34, v34, v36 +; SI-NEXT: v_add_i32_e32 v36, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16: ; VI: ; %bb.0: @@ -115545,7 +246558,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 @@ -115645,7 +246658,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v33, v17 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: .LBB108_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -115660,7 +246673,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -115696,7 +246709,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: .LBB108_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -115713,7 +246726,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -115748,7 +246761,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: .LBB108_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -115769,1080 +246782,2076 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v40 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v31, v31, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v33 +; SI-NEXT: v_or_b32_e32 v32, v32, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v36 +; SI-NEXT: v_or_b32_e32 v35, v35, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v39 +; SI-NEXT: v_or_b32_e32 v38, v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v51 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v49, v48, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_or_b32_e32 v52, v48, v51 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v59 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v46, v48, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v57 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v61 +; SI-NEXT: v_or_b32_e32 v57, v48, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v63 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v60, v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_or_b32_e32 v59, v54, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_or_b32_e32 v56, v54, v48 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v45, v40, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_or_b32_e32 v7, v41, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v13 +; SI-NEXT: v_or_b32_e32 v23, v23, v17 +; SI-NEXT: v_or_b32_e32 v34, v34, v21 +; SI-NEXT: v_alignbit_b32 v4, v57, v4, 16 +; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16 +; SI-NEXT: v_alignbit_b32 v62, v29, v48, 16 +; SI-NEXT: v_alignbit_b32 v61, v52, v54, 16 +; SI-NEXT: v_alignbit_b32 v44, v49, v55, 16 +; SI-NEXT: v_alignbit_b32 v13, v32, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v2, v21, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v41, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v41, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v41, v28 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 +; SI-NEXT: v_or_b32_e32 v7, v41, v27 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v41, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v43, v42, v24 +; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v41, v37 +; SI-NEXT: v_mov_b32_e32 v51, v7 +; SI-NEXT: v_alignbit_b32 v7, v38, v40, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v37, v1, v37, 16 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_and_b32_e32 v48, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v53 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB109_3 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v33, v15 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v33, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v33, v13 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v33, v12 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v33, v11 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v33, v10 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v33, v9 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v33, v8 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v33, v6 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v33, v5 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v33, v4 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v33, v3 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v33, v2 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v33, v1 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v33, v0 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v31, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v33, v31 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v30 +; VI-NEXT: v_add_f16_sdwa v30, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v33, v30 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v29 +; VI-NEXT: v_add_f16_sdwa v29, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v33, v29 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v28 +; VI-NEXT: v_add_f16_sdwa v28, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v33, v28 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v27 +; VI-NEXT: v_add_f16_sdwa v27, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v33, v27 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v26 +; VI-NEXT: v_add_f16_sdwa v26, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v25 +; VI-NEXT: v_add_f16_sdwa v25, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v33, v25 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v24 +; VI-NEXT: v_add_f16_sdwa v24, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v33, v24 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v23 +; VI-NEXT: v_add_f16_sdwa v23, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v33, v23 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v22 +; VI-NEXT: v_add_f16_sdwa v22, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v33, v22 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v21 +; VI-NEXT: v_add_f16_sdwa v21, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v33, v21 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v20 +; VI-NEXT: v_add_f16_sdwa v20, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v33, v20 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v19 +; VI-NEXT: v_add_f16_sdwa v19, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v33, v19 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v32 +; VI-NEXT: v_add_f16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v32, v33, v32 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v33, v17 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: s_branch .LBB109_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB109_3 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB109_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: s_branch .LBB109_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-NEXT: .LBB109_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB109_3: +; GFX11-NEXT: s_branch .LBB109_2 +; GFX11-NEXT: .LBB109_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB54_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v63 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: .LBB54_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v2, v1 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64f16: ; VI: ; %bb.0: @@ -116854,7 +248863,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB110_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 3 ; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -116954,7 +248963,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB54_2: ; %end +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -116969,7 +248978,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -117004,7 +249013,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB54_2: ; %end +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -117021,7 +249030,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -117056,7 +249065,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB54_2: ; %end +; GFX11-NEXT: .LBB110_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -117076,3 +249085,1288 @@ end: %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <64 x half> %phi } + +define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; SI-NEXT: v_mov_b32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB111_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 +; SI-NEXT: v_mov_b32_e32 v24, v43 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v23 +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: s_branch .LBB111_3 +; SI-NEXT: .LBB111_2: +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB111_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v46, v56 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_mov_b32_e32 v11, v13 +; SI-NEXT: v_mov_b32_e32 v13, v15 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_mov_b32_e32 v17, v19 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: s_cbranch_vccnz .LBB111_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v60 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s39, s7 +; VI-NEXT: s_or_b32 s6, s38, s6 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB111_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB111_4 +; GFX11-NEXT: .LBB111_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB111_3: +; GFX11-NEXT: s_branch .LBB111_2 +; GFX11-NEXT: .LBB111_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index c0577b1c1a2b5..4cf1a71470c53 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1,28 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v4f32: ; VI: ; %bb.0: @@ -89,23 +88,124 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v4i32_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v4i32: ; VI: ; %bb.0: @@ -170,23 +270,127 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v2i64: ; VI: ; %bb.0: @@ -253,23 +457,124 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v4i32_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v4i32: ; VI: ; %bb.0: @@ -337,23 +642,124 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v2i64_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v2f64: ; VI: ; %bb.0: @@ -420,21 +826,122 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v4i32_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v4i32: ; VI: ; %bb.0: @@ -443,11 +950,11 @@ define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +965,11 @@ define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -474,11 +981,11 @@ define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -498,42 +1005,137 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8i16: ; VI: ; %bb.0: @@ -600,65 +1202,185 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v4i32_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v4i32: ; VI: ; %bb.0: @@ -667,7 +1389,7 @@ define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -682,7 +1404,7 @@ define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -735,70 +1457,218 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v8i16_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8f16: ; VI: ; %bb.0: @@ -865,83 +1735,213 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v4i32_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4i32: ; VI: ; %bb.0: @@ -950,7 +1950,7 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -965,7 +1965,7 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1019,62 +2019,225 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v8f16_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v3 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8bf16: ; VI: ; %bb.0: @@ -1141,75 +2304,205 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v4i32: ; VI: ; %bb.0: @@ -1218,7 +2511,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -1293,7 +2586,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1304,7 +2597,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -1368,7 +2661,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1380,7 +2673,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1455,7 +2748,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1467,7 +2760,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1533,7 +2826,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1553,66 +2846,427 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v16i8: ; VI: ; %bb.0: @@ -1636,7 +3290,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -1650,9 +3304,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: s_cbranch_execz .LBB24_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -1670,7 +3324,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -1700,7 +3354,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -1714,9 +3368,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -1734,7 +3388,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -1758,7 +3412,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -1768,9 +3422,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 @@ -1786,7 +3440,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB24_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -1819,7 +3473,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -1833,9 +3487,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 @@ -1855,7 +3509,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -1879,127 +3533,484 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v4i32_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_v4i32_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_v4i32_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v4i32: ; VI: ; %bb.0: @@ -2018,14 +4029,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: s_cbranch_execnz .LBB26_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_4 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB26_4 +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2055,8 +4066,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: .LBB26_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2106,14 +4117,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: s_cbranch_execnz .LBB26_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_4 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2143,8 +4154,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: .LBB26_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2198,14 +4209,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -2257,8 +4268,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -2328,14 +4339,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -2382,8 +4393,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -2448,23 +4459,472 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v16i8_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v2i64: ; VI: ; %bb.0: @@ -2529,23 +4989,127 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v4f32: ; VI: ; %bb.0: @@ -2613,23 +5177,124 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v2i64_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v2f64: ; VI: ; %bb.0: @@ -2694,21 +5359,125 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v4f32: ; VI: ; %bb.0: @@ -2717,11 +5486,11 @@ define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2732,28 +5501,125 @@ define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2f64_to_v4f32: +; GFX11-LABEL: bitcast_v2f64_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB34_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_4 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_3: +; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v4f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2773,41 +5639,39 @@ end: } define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8i16: ; VI: ; %bb.0: @@ -2872,65 +5736,186 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v4f32: ; VI: ; %bb.0: @@ -2939,7 +5924,7 @@ define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -2954,7 +5939,7 @@ define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3007,70 +5992,218 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v8i16_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16: ; VI: ; %bb.0: @@ -3135,83 +6268,215 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v4f32_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4f32: ; VI: ; %bb.0: @@ -3220,7 +6485,7 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3235,7 +6500,7 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3289,62 +6554,225 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v8f16_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v3 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8bf16: ; VI: ; %bb.0: @@ -3409,75 +6837,208 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v4f32: ; VI: ; %bb.0: @@ -3486,7 +7047,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -3561,7 +7122,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3572,7 +7133,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -3636,7 +7197,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3648,7 +7209,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3723,7 +7284,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,7 +7296,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3801,7 +7362,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3821,66 +7382,427 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v16i8: ; VI: ; %bb.0: @@ -3904,7 +7826,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -3918,9 +7840,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -3938,7 +7860,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -3968,7 +7890,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -3982,9 +7904,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4002,7 +7924,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -4026,7 +7948,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -4036,9 +7958,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10 @@ -4052,7 +7974,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -4085,7 +8007,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -4099,9 +8021,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v18, 1.0, v18 @@ -4119,7 +8041,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -4143,127 +8065,513 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s9, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s13, s19, 8 +; VI-NEXT: s_lshr_b32 s12, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s22, s17, 8 +; VI-NEXT: s_lshr_b32 s21, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v19, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v18, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s12 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s22, s17, 8 +; GFX9-NEXT: s_lshr_b32 s21, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v19, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v18, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v17, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v16, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s22 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4f32_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4f32_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v10, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v14, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v4f32: ; VI: ; %bb.0: @@ -4282,14 +8590,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: s_cbranch_execnz .LBB50_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_4 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB50_4 +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_3: ; %cmp.false +; VI-NEXT: .LBB50_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4319,8 +8627,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: .LBB50_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4370,14 +8678,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: s_cbranch_execnz .LBB50_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_4 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB50_4 +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4407,8 +8715,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: .LBB50_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4462,14 +8770,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -4521,8 +8829,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -4592,14 +8900,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -4646,8 +8954,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -4712,23 +9020,472 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v16i8_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB51_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v2f64: ; VI: ; %bb.0: @@ -4796,21 +9553,121 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v2i64_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v2i64: ; VI: ; %bb.0: @@ -4819,11 +9676,11 @@ define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4834,11 +9691,11 @@ define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4850,11 +9707,11 @@ define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4874,42 +9731,137 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_3: +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8i16: ; VI: ; %bb.0: @@ -4977,65 +9929,185 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v2i64_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v2i64: ; VI: ; %bb.0: @@ -5044,7 +10116,7 @@ define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -5059,7 +10131,7 @@ define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5112,70 +10184,218 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v8i16_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v3 -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8f16: ; VI: ; %bb.0: @@ -5243,83 +10463,213 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v2i64_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2i64: ; VI: ; %bb.0: @@ -5328,7 +10678,7 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5343,7 +10693,7 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5397,62 +10747,225 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v8f16_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v3 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8bf16: ; VI: ; %bb.0: @@ -5520,75 +11033,205 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s9, s18, 3 +; SI-NEXT: s_addc_u32 s7, s19, 0 +; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s10, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s5, 16 +; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v2i64: ; VI: ; %bb.0: @@ -5597,7 +11240,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -5672,7 +11315,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5683,7 +11326,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -5747,7 +11390,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,7 +11402,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -5834,7 +11477,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5846,7 +11489,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5912,7 +11555,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5932,66 +11575,427 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v16i8: ; VI: ; %bb.0: @@ -6015,7 +12019,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -6029,9 +12033,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: s_cbranch_execz .LBB68_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -6049,7 +12053,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -6079,7 +12083,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -6093,9 +12097,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -6113,7 +12117,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -6137,7 +12141,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -6147,9 +12151,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6165,7 +12169,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_4: ; %end +; GFX11-TRUE16-NEXT: .LBB68_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -6198,7 +12202,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -6212,9 +12216,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB68_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6234,7 +12238,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -6258,127 +12262,484 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v2i64_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v2i64_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v2i64_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v2i64: ; VI: ; %bb.0: @@ -6397,14 +12758,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6434,8 +12795,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6485,14 +12846,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6522,8 +12883,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6577,14 +12938,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -6636,8 +12997,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -6707,14 +13068,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -6761,8 +13122,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -6827,44 +13188,492 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v16i8_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v11, v10, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v11, v10, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: v_mov_b32_e32 v6, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v6, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8i16: ; VI: ; %bb.0: @@ -6873,11 +13682,11 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6888,11 +13697,11 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6904,11 +13713,11 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: .LBB72_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6928,65 +13737,183 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB73_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_branch .LBB73_5 +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: .LBB73_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v6, v9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v2f64: ; VI: ; %bb.0: @@ -6995,7 +13922,7 @@ define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -7010,7 +13937,7 @@ define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: .LBB74_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7063,62 +13990,210 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v8i16_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v9 -; GCN-NEXT: v_mov_b32_e32 v1, v11 -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mov_b32_e32 v3, v10 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16: ; VI: ; %bb.0: @@ -7127,11 +14202,11 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7142,11 +14217,11 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7158,11 +14233,11 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: .LBB76_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7182,83 +14257,207 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v2f64_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2f64: ; VI: ; %bb.0: @@ -7267,7 +14466,7 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7282,7 +14481,7 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7336,54 +14535,216 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v8f16_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_4 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_3: +; VI-NEXT: s_branch .LBB79_2 +; VI-NEXT: .LBB79_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_4 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_3: +; GFX9-NEXT: s_branch .LBB79_2 +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v11 -; GCN-NEXT: v_mov_b32_e32 v1, v10 -; GCN-NEXT: v_mov_b32_e32 v2, v9 -; GCN-NEXT: v_mov_b32_e32 v3, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8bf16: ; VI: ; %bb.0: @@ -7392,11 +14753,11 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,28 +14768,153 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2f64_to_v8bf16: +; GFX11-LABEL: bitcast_v2f64_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB80_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + +define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s13, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s19, 16 +; SI-NEXT: s_and_b32 s11, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s18, 16 +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7448,74 +14934,74 @@ end: } define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v2f64: ; VI: ; %bb.0: @@ -7524,7 +15010,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -7599,7 +15085,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7610,7 +15096,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -7674,7 +15160,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7686,7 +15172,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -7761,7 +15247,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,7 +15259,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7839,7 +15325,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7859,68 +15345,429 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_3: +; GFX11-NEXT: s_branch .LBB83_2 +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v17, v16, 24 -; GCN-NEXT: v_alignbit_b32 v10, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v17, v16, 8 -; GCN-NEXT: v_alignbit_b32 v3, v19, v18, 24 -; GCN-NEXT: v_alignbit_b32 v2, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_alignbit_b32 v11, v17, v16, 24 -; GCN-NEXT: v_alignbit_b32 v10, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v17, v16, 8 -; GCN-NEXT: v_alignbit_b32 v3, v19, v18, 24 -; GCN-NEXT: v_alignbit_b32 v2, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v18 -; GCN-NEXT: v_mov_b32_e32 v4, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: v_mov_b32_e32 v12, v17 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v18 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v16i8: ; VI: ; %bb.0: @@ -7944,7 +15791,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -7958,9 +15805,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -7976,7 +15823,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -8006,7 +15853,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -8020,9 +15867,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -8038,7 +15885,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -8062,7 +15909,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -8072,9 +15919,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -8087,7 +15934,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_4: ; %end +; GFX11-TRUE16-NEXT: .LBB84_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -8120,7 +15967,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -8134,9 +15981,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB84_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -8153,7 +16000,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -8177,127 +16024,505 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 8 +; SI-NEXT: s_lshr_b32 s8, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: s_branch .LBB85_5 +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v18 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s15, s19, 24 +; VI-NEXT: s_lshr_b32 s14, s19, 16 +; VI-NEXT: s_lshr_b32 s13, s19, 8 +; VI-NEXT: s_lshr_b32 s21, s18, 16 +; VI-NEXT: s_lshr_b32 s20, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s23, s16, 16 +; VI-NEXT: s_lshr_b32 s22, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v1, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v15, s15 +; VI-NEXT: v_mov_b32_e32 v14, s14 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s15, s19, 24 +; GFX9-NEXT: s_lshr_b32 s14, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s21, s18, 16 +; GFX9-NEXT: s_lshr_b32 s20, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s23, s16, 16 +; GFX9-NEXT: s_lshr_b32 s22, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s15 +; GFX9-NEXT: v_mov_b32_e32 v14, s14 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2f64_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s14 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2f64_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v1, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v14, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v6, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s9 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_4 -; GCN-NEXT: .LBB43_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB43_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: .LBB43_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_4 +; SI-NEXT: .LBB86_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB86_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: .LBB86_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v2f64: ; VI: ; %bb.0: @@ -8316,14 +16541,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: s_cbranch_execnz .LBB86_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB86_4 +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8353,8 +16578,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 -; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB86_2 +; VI-NEXT: .LBB86_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8404,14 +16629,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: s_cbranch_execnz .LBB86_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB86_4 +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8441,8 +16666,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 -; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB86_2 +; GFX9-NEXT: .LBB86_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8496,14 +16721,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -8555,8 +16780,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -8626,14 +16851,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -8680,8 +16905,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -8746,74 +16971,524 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v16i8_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: .LBB87_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB87_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB87_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v7 -; GCN-NEXT: v_mov_b32_e32 v9, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v11, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v13, v2 -; GCN-NEXT: v_mov_b32_e32 v14, v1 -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_4 -; GCN-NEXT: .LBB44_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB44_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: .LBB44_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_4 +; SI-NEXT: .LBB88_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB88_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: .LBB88_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8f16: ; VI: ; %bb.0: @@ -8822,7 +17497,7 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -8837,7 +17512,7 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v7 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: .LBB88_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8857,21 +17532,164 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v8i16_to_v8f16: +; GFX11-LABEL: bitcast_v8i16_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v8i16_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_4 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_3: +; GFX9-NEXT: s_branch .LBB89_2 +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v8f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8891,60 +17709,60 @@ end: } define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v8 -; GCN-NEXT: v_or_b32_e32 v2, v2, v9 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8i16: ; VI: ; %bb.0: @@ -8953,7 +17771,7 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v0 @@ -8968,7 +17786,7 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v7, v2 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: .LBB90_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9022,71 +17840,227 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v8f16_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_e32 v4, s16, v0 +; VI-NEXT: v_add_f16_sdwa v5, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v6, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v7, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v0, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v0 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v4, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_3: +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v6 -; GCN-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8bf16: ; VI: ; %bb.0: @@ -9095,7 +18069,7 @@ define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9110,7 +18084,7 @@ define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v7 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: .LBB92_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9163,86 +18137,249 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s10, s20, 16 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_lshl_b32 s13, s22, 16 +; SI-NEXT: s_lshl_b32 s12, s23, 16 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s5, 16 +; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s4, 16 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_4 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_3: +; GFX9-NEXT: s_branch .LBB93_2 +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_4 -; GCN-NEXT: .LBB47_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB47_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: .LBB47_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v2, 16 -; GCN-NEXT: v_alignbit_b32 v6, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v5, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v11, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_4 +; SI-NEXT: .LBB94_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v8i16: ; VI: ; %bb.0: @@ -9251,7 +18388,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -9326,7 +18463,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9337,7 +18474,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -9401,7 +18538,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v5, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9413,7 +18550,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 @@ -9492,7 +18629,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9504,7 +18641,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9569,7 +18706,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9589,109 +18726,473 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB95_2 +; +; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v3 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v3, v3, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX9-NEXT: v_and_or_b32 v2, v2, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, v8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v5, v4 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v6, v8 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB95_3: +; GFX11-NEXT: s_branch .LBB95_2 +; GFX11-NEXT: .LBB95_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v7 -; GCN-NEXT: v_mov_b32_e32 v16, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_4 -; GCN-NEXT: .LBB48_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB48_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_bfe_u32 v7, v21, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v24 -; GCN-NEXT: v_or_b32_e32 v4, v1, v25 -; GCN-NEXT: v_or_b32_e32 v8, v2, v22 -; GCN-NEXT: v_or_b32_e32 v12, v3, v23 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_bfe_u32 v15, v20, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: .LBB48_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v25, v3 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v3 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_4 +; SI-NEXT: .LBB96_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB96_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v8, v5, v24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v4, v1, v23 +; SI-NEXT: v_or_b32_e32 v12, v5, v22 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; SI-NEXT: v_bfe_u32 v7, v21, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v20, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: .LBB96_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v16i8: ; VI: ; %bb.0: @@ -9717,7 +19218,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 @@ -9733,9 +19234,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v21, v19 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9762,7 +19263,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v16 ; VI-NEXT: v_mov_b32_e32 v1, v20 @@ -9792,7 +19293,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -9806,9 +19307,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -9826,7 +19327,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -9850,7 +19351,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -9860,9 +19361,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] @@ -9878,7 +19379,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %end +; GFX11-TRUE16-NEXT: .LBB96_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -9911,7 +19412,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -9925,9 +19426,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] @@ -9947,8 +19448,437 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_alignbit_b32 v3, s8, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s8, v0, 8 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_alignbit_b32 v11, s9, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v0, 8 +; SI-NEXT: s_lshr_b32 s10, s8, 8 +; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_bfe_u32 s12, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s23, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s8, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s8, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_alignbit_b32 v11, s9, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v0, 8 +; SI-NEXT: s_lshr_b32 s12, s8, 24 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s10, s8, 8 +; SI-NEXT: s_lshr_b32 s15, s9, 24 +; SI-NEXT: s_lshr_b32 s14, s9, 16 +; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s7 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB97_2 +; +; VI-LABEL: bitcast_v8i16_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s7, s19, 3 +; VI-NEXT: s_add_i32 s9, s16, 3 +; VI-NEXT: s_add_i32 s11, s17, 3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s10, 0x30000 +; VI-NEXT: s_add_i32 s16, s8, 0x30000 +; VI-NEXT: s_add_i32 s19, s6, 0x30000 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s22, s17, 8 +; GFX9-NEXT: s_lshr_b32 s21, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v19, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s22 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8i16_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8i16_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v10, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v14, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 @@ -9972,139 +19902,139 @@ end: } define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v9 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v7, v7, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v3, v16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v8, v18, v5 -; GCN-NEXT: v_or_b32_e32 v5, v19, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_or_b32_e32 v15, v4, v8 -; GCN-NEXT: v_or_b32_e32 v9, v0, v2 -; GCN-NEXT: v_or_b32_e32 v13, v6, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v1, v11, v2, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v20, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: v_or_b32_e32 v3, v19, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_or_b32_e32 v7, v18, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v16, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v2 -; GCN-NEXT: v_alignbit_b32 v1, v11, v9, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v13, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v9 -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v13 -; GCN-NEXT: v_mov_b32_e32 v6, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v16, v2 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v4, v21, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v15, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v13, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v10, v23, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v10, 16 +; SI-NEXT: v_or_b32_e32 v4, v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v13, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8i16: ; VI: ; %bb.0: @@ -10123,14 +20053,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: s_cbranch_execnz .LBB98_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_4 -; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB98_4 +; VI-NEXT: .LBB98_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: .LBB98_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10160,8 +20090,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: .LBB98_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10211,14 +20141,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: s_cbranch_execnz .LBB98_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_4 -; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB98_4 +; GFX9-NEXT: .LBB98_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10248,8 +20178,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: .LBB98_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10303,14 +20233,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -10362,8 +20292,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -10433,14 +20363,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -10487,8 +20417,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -10553,90 +20483,561 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s8, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s6, 24 +; SI-NEXT: s_or_b32 s13, s11, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s27, 24 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s16, 0xff +; SI-NEXT: s_lshl_b32 s12, s17, 8 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_or_b32 s11, s11, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s12, s25, 8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: v_alignbit_b32 v1, s10, v0, 16 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v5, s7, v0, 16 +; SI-NEXT: s_or_b32 s9, s4, s9 +; SI-NEXT: s_lshr_b32 s12, s5, 16 +; SI-NEXT: s_lshr_b32 s13, s13, 16 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s5, s6, 24 +; SI-NEXT: s_and_b32 s6, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s10, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_alignbit_b32 v1, s10, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v5, s7, v0, 16 +; SI-NEXT: s_lshr_b32 s12, s10, 16 +; SI-NEXT: s_lshr_b32 s13, s7, 16 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v16i8_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-NEXT: .LBB99_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB99_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB99_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_4 -; GCN-NEXT: .LBB50_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB50_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: .LBB50_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_4 +; SI-NEXT: .LBB100_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB100_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: .LBB100_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8bf16: ; VI: ; %bb.0: @@ -10645,7 +21046,7 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v0 @@ -10660,7 +21061,7 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v7, v2 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10714,98 +21115,268 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s23 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_e32 v4, s16, v0 +; VI-NEXT: v_add_f16_sdwa v5, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v6, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v7, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v0, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v0 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v4, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_4 -; GCN-NEXT: .LBB51_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB51_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: .LBB51_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_4 +; SI-NEXT: .LBB102_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB102_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v8f16: ; VI: ; %bb.0: @@ -10814,7 +21385,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -10889,7 +21460,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10900,7 +21471,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -10964,7 +21535,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v5, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10976,7 +21547,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11051,7 +21622,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v7, v3 ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v4 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11063,7 +21634,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11128,7 +21699,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11148,115 +21719,503 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s23 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB103_2 +; +; VI-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v3 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v9 +; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v6, 16, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB103_3: +; GFX11-NEXT: s_branch .LBB103_2 +; GFX11-NEXT: .LBB103_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v6 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_4 -; GCN-NEXT: .LBB52_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB52_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v17, v0 -; GCN-NEXT: v_or_b32_e32 v4, v16, v1 -; GCN-NEXT: v_or_b32_e32 v8, v19, v2 -; GCN-NEXT: v_or_b32_e32 v12, v18, v3 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: .LBB52_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v8, v1, v0 -; GCN-NEXT: v_or_b32_e32 v12, v2, v9 -; GCN-NEXT: v_or_b32_e32 v0, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v5, v10 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_4 +; SI-NEXT: .LBB104_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB104_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v8, v20, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v4, v16, v1 +; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: .LBB104_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v16i8: ; VI: ; %bb.0: @@ -11280,7 +22239,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -11290,9 +22249,9 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -11319,7 +22278,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -11349,7 +22308,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -11363,9 +22322,9 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] @@ -11384,7 +22343,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -11408,7 +22367,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -11418,9 +22377,9 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] @@ -11436,7 +22395,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_4: ; %end +; GFX11-TRUE16-NEXT: .LBB104_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -11449,48 +22408,450 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v17.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8: +; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: .LBB104_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v8, v20, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v4, v16, v1 +; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v8f16_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s19, 24 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: s_lshr_b32 s11, s19, 8 +; VI-NEXT: s_lshr_b32 s21, s18, 16 +; VI-NEXT: s_lshr_b32 s13, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s22, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s23, s16, 16 +; VI-NEXT: s_lshr_b32 s12, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v17, s17, v1 +; VI-NEXT: v_add_f16_e32 v2, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v19, v17, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 +; VI-NEXT: v_add_f16_e32 v14, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v18, v0, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; VI-NEXT: v_add_f16_e32 v16, s19, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: v_or_b32_e32 v21, v16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_or_b32_e32 v20, v8, v3 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v9, s13 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v17 +; VI-NEXT: v_mov_b32_e32 v12, v16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s22, s17, 8 +; GFX9-NEXT: s_lshr_b32 s21, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v19, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s22 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8f16_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB105_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, s0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 @@ -11505,8 +22866,33 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v10, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v14, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 @@ -11530,122 +22916,122 @@ end: } define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v16 -; GCN-NEXT: v_or_b32_e32 v1, v1, v18 -; GCN-NEXT: v_or_b32_e32 v2, v2, v19 -; GCN-NEXT: v_or_b32_e32 v3, v3, v20 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_or_b32_e32 v5, v5, v22 -; GCN-NEXT: v_or_b32_e32 v6, v6, v23 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: .LBB53_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v15, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v3, v22, v3 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: v_or_b32_e32 v7, v16, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GCN-NEXT: .LBB53_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v9 -; GCN-NEXT: v_mov_b32_e32 v6, v13 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: v_mov_b32_e32 v6, v13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8f16: ; VI: ; %bb.0: @@ -11664,14 +23050,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: s_cbranch_execnz .LBB106_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_4 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB106_4 +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: .LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -11701,8 +23087,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 -; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB106_2 +; VI-NEXT: .LBB106_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -11752,14 +23138,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: s_cbranch_execnz .LBB106_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_4 -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB106_4 +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -11789,8 +23175,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 -; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB106_2 +; GFX9-NEXT: .LBB106_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -11844,14 +23230,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -11903,8 +23289,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -11974,14 +23360,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -12028,8 +23414,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -12094,110 +23480,555 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s24, 0xff +; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: s_lshl_b32 s9, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s20, 0xff +; SI-NEXT: s_lshl_b32 s10, s21, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xff +; SI-NEXT: s_lshl_b32 s11, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s16, 0xff +; SI-NEXT: s_lshl_b32 s12, s17, 8 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB107_2 +; +; VI-LABEL: bitcast_v16i8_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-NEXT: .LBB107_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB107_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB107_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_4 -; GCN-NEXT: .LBB54_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB54_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v20 -; GCN-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v16, 16 -; GCN-NEXT: v_alignbit_b32 v8, v1, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v19, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: .LBB54_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_4 +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB108_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v20, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v18, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: .LBB108_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v16i8: ; VI: ; %bb.0: @@ -12221,7 +24052,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -12235,9 +24066,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -12324,7 +24155,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -12354,7 +24185,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -12368,9 +24199,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -12450,7 +24281,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -12481,7 +24312,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11 @@ -12497,9 +24328,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12578,7 +24409,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h @@ -12612,7 +24443,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -12626,9 +24457,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB54_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v19 @@ -12717,7 +24548,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -12741,139 +24572,839 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_alignbit_b32 v0, v0, v19, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v17, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s13, s19, 8 +; VI-NEXT: s_lshr_b32 s12, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s22, s17, 8 +; VI-NEXT: s_lshr_b32 s21, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: s_branch .LBB109_5 +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s12 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB109_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s19, 24 +; GFX9-NEXT: s_lshr_b32 s23, s19, 16 +; GFX9-NEXT: s_lshr_b32 s15, s19, 8 +; GFX9-NEXT: s_lshr_b32 s21, s18, 16 +; GFX9-NEXT: s_lshr_b32 s20, s18, 8 +; GFX9-NEXT: s_lshr_b32 s10, s17, 24 +; GFX9-NEXT: s_lshr_b32 s22, s17, 16 +; GFX9-NEXT: s_lshr_b32 s11, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b32 s13, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB109_4 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: s_branch .LBB109_5 +; GFX9-NEXT: .LBB109_3: +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v17, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB109_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v17 +; GFX9-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v7, v9 :: v_dual_and_b32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v11, v8 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v14, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_branch .LBB109_5 +; GFX11-TRUE16-NEXT: .LBB109_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s4 +; GFX11-TRUE16-NEXT: .LBB109_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: s_branch .LBB109_5 +; GFX11-FAKE16-NEXT: .LBB109_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB109_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v17 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v16 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v11, v1, v0 -; GCN-NEXT: v_or_b32_e32 v15, v18, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v19, v4 -; GCN-NEXT: v_or_b32_e32 v13, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v20, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v21, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v12 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v23, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v3, v21, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v7 -; GCN-NEXT: v_or_b32_e32 v7, v20, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v19, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v11 -; GCN-NEXT: v_mov_b32_e32 v1, v15 -; GCN-NEXT: v_mov_b32_e32 v2, v17 -; GCN-NEXT: v_mov_b32_e32 v4, v13 -; GCN-NEXT: v_mov_b32_e32 v6, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: v_or_b32_e32 v11, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v21, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v7, v13, v2 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v15 +; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: v_mov_b32_e32 v6, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8bf16: ; VI: ; %bb.0: @@ -12892,14 +25423,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: s_cbranch_execnz .LBB110_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB110_4 +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -12929,8 +25460,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 -; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB110_2 +; VI-NEXT: .LBB110_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -12980,14 +25511,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: s_cbranch_execnz .LBB110_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB110_4 +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13017,8 +25548,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 -; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB110_2 +; GFX9-NEXT: .LBB110_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -13072,14 +25603,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -13131,8 +25662,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -13202,14 +25733,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -13256,8 +25787,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -13321,3 +25852,468 @@ end: %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <8 x bfloat> %phi } + +define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: s_cbranch_scc0 .LBB111_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB111_3 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s8, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s10, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: .LBB111_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB111_2 +; +; VI-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v4 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-NEXT: .LBB111_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB111_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB111_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index 25bf7b2255e5c..6e2ae809d5030 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i32_to_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i32_to_v5f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v5f32: ; VI: ; %bb.0: @@ -92,24 +92,133 @@ end: ret <5 x float> %phi } +define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i32_to_v5f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v5i32_to_v5f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v5i32_to_v5f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v5i32_to_v5f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <5 x float> + br label %end + +end: + %phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f32_to_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f32_to_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v5i32: ; VI: ; %bb.0: @@ -177,50 +286,164 @@ end: ret <5 x i32> %phi } +define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f32_to_v5i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v5i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v5i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f32_to_v5i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} + define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i32_to_v10i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i32_to_v10i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10i16: ; VI: ; %bb.0: @@ -290,76 +513,209 @@ end: ret <10 x i16> %phi } +define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i32_to_v10i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v5i32_to_v10i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v5i32_to_v10i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v5i32_to_v10i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <10 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <10 x i16> + br label %end + +end: + %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i16> %phi +} + define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i16_to_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v10 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i16_to_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v5i32: ; VI: ; %bb.0: @@ -368,7 +724,7 @@ define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_add_u16_e32 v5, 3, v4 @@ -386,7 +742,7 @@ define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v5, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -441,81 +797,249 @@ end: ret <5 x i32> %phi } +define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i16_to_v5i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v10i16_to_v5i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v10i16_to_v5i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i16_to_v5i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i16> %a, splat (i16 3) + %a2 = bitcast <10 x i16> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x i16> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} + define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i32_to_v10f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i32_to_v10f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10f16: ; VI: ; %bb.0: @@ -585,97 +1109,242 @@ end: ret <10 x half> %phi } +define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i32_to_v10f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v5i32_to_v10f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v5i32_to_v10f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v5i32_to_v10f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <10 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <10 x half> + br label %end + +end: + %phi = phi <10 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x half> %phi +} + define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f16_to_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v3, v6, v3 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v8, v4 -; GCN-NEXT: v_or_b32_e32 v3, v6, v9 -; GCN-NEXT: v_or_b32_e32 v4, v5, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f16_to_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5i32: ; VI: ; %bb.0: @@ -684,7 +1353,7 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -702,7 +1371,7 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -758,50 +1427,236 @@ end: ret <5 x i32> %phi } +define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f16_to_v5i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s24 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v10f16_to_v5i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v5, v1 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f16_to_v5i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f16_to_v5i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x half> %a, splat (half 0xH0200) + %a2 = bitcast <10 x half> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x half> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} + define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f32_to_v10i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f32_to_v10i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10i16: ; VI: ; %bb.0: @@ -869,76 +1724,217 @@ end: ret <10 x i16> %phi } +define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f32_to_v10i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v10i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v10i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f32_to_v10i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <10 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <10 x i16> + br label %end + +end: + %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i16> %phi +} + define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i16_to_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v10 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i16_to_v5f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v5f32: ; VI: ; %bb.0: @@ -947,7 +1943,7 @@ define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_add_u16_e32 v5, 3, v4 @@ -965,7 +1961,7 @@ define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v5, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1020,81 +2016,249 @@ end: ret <5 x float> %phi } +define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i16_to_v5f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v10i16_to_v5f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v10i16_to_v5f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i16_to_v5f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i16> %a, splat (i16 3) + %a2 = bitcast <10 x i16> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x i16> %a to <5 x float> + br label %end + +end: + %phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f32_to_v10f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f32_to_v10f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10f16: ; VI: ; %bb.0: @@ -1162,97 +2326,252 @@ end: ret <10 x half> %phi } +define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f32_to_v10f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v5f32_to_v10f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v10f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f32_to_v10f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <10 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <10 x half> + br label %end + +end: + %phi = phi <10 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x half> %phi +} + define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f16_to_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v3, v6, v3 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v8, v4 -; GCN-NEXT: v_or_b32_e32 v3, v6, v9 -; GCN-NEXT: v_or_b32_e32 v4, v5, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f16_to_v5f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5f32: ; VI: ; %bb.0: @@ -1261,7 +2580,7 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1279,7 +2598,7 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1335,86 +2654,272 @@ end: ret <5 x float> %phi } +define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f16_to_v5f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s24 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v10f16_to_v5f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v5, v1 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f16_to_v5f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f16_to_v5f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x half> %a, splat (half 0xH0200) + %a2 = bitcast <10 x half> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x half> %a to <5 x float> + br label %end + +end: + %phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i16_to_v10f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v19, v9 -; GCN-NEXT: v_mov_b32_e32 v18, v8 -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v16, v6 -; GCN-NEXT: v_mov_b32_e32 v15, v5 -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i16_to_v10f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v9 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v10f16: ; VI: ; %bb.0: @@ -1423,7 +2928,7 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1441,7 +2946,7 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v8 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1496,71 +3001,237 @@ end: ret <10 x half> %phi } +define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i16_to_v10f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v10i16_to_v10f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v10i16_to_v10f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i16_to_v10f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i16> %a, splat (i16 3) + %a2 = bitcast <10 x i16> %a1 to <10 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i16> %a to <10 x half> + br label %end + +end: + %phi = phi <10 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x half> %phi +} + define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f16_to_v10i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v10 -; GCN-NEXT: v_or_b32_e32 v6, v6, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v12 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f16_to_v10i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v10i16: ; VI: ; %bb.0: @@ -1569,7 +3240,7 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v0 @@ -1587,7 +3258,7 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v8, v2 ; VI-NEXT: v_or_b32_e32 v1, v7, v1 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1642,3 +3313,190 @@ end: %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <10 x i16> %phi } + +define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f16_to_v10i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v10f16_to_v10i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_e32 v5, s16, v0 +; VI-NEXT: v_add_f16_sdwa v6, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v7, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v8, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v9, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v0 +; VI-NEXT: v_or_b32_e32 v3, v3, v9 +; VI-NEXT: v_or_b32_e32 v2, v2, v8 +; VI-NEXT: v_or_b32_e32 v1, v1, v7 +; VI-NEXT: v_or_b32_e32 v0, v5, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f16_to_v10i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f16_to_v10i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x half> %a, splat (half 0xH0200) + %a2 = bitcast <10 x half> %a1 to <10 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x half> %a to <10 x i16> + br label %end + +end: + %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index 27d32fc05e428..7556d355a3844 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -1,37 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define half @bitcast_i16_to_f16(i16 %a, i32 %b) { -; GCN-LABEL: bitcast_i16_to_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_4 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB0_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: .LBB0_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i16_to_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_4 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB0_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: .LBB0_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_f16: ; VI: ; %bb.0: @@ -111,23 +111,126 @@ end: ret half %phi } +define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i16_to_f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_i16_to_f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_i16_to_f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-TRUE16-LABEL: bitcast_i16_to_f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-TRUE16-NEXT: .LBB1_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s0, 3 +; GFX11-TRUE16-NEXT: .LBB1_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB1_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB1_2 +; +; GFX11-FAKE16-LABEL: bitcast_i16_to_f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB1_3: +; GFX11-FAKE16-NEXT: .LBB1_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i16 %a, 3 + %a2 = bitcast i16 %a1 to half + br label %end + +cmp.false: + %a3 = bitcast i16 %a to half + br label %end + +end: + %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret half %phi +} + define i16 @bitcast_f16_to_i16(half %a, i32 %b) { -; GCN-LABEL: bitcast_f16_to_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f16_to_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_i16: ; VI: ; %bb.0: @@ -168,10 +271,10 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB1_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x200, v0.l -; GFX11-TRUE16-NEXT: .LBB1_4: ; %end +; GFX11-TRUE16-NEXT: .LBB2_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l @@ -207,22 +310,127 @@ end: ret i16 %phi } +define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f16_to_i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_f16_to_i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f16_to_i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_add_f16_e32 v0, s16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f16_to_i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-TRUE16-NEXT: .LBB3_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, 0x200, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB3_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB3_2 +; GFX11-TRUE16-NEXT: .LBB3_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f16_to_i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-FAKE16-NEXT: .LBB3_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, 0x200, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB3_3: +; GFX11-FAKE16-NEXT: s_branch .LBB3_2 +; GFX11-FAKE16-NEXT: .LBB3_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd half %a, 0xH0200 + %a2 = bitcast half %a1 to i16 + br label %end + +cmp.false: + %a3 = bitcast half %a to i16 + br label %end + +end: + %phi = phi i16 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i16 %phi +} + define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) { -; GCN-LABEL: bitcast_i16_to_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i16_to_bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_bf16: ; VI: ; %bb.0: @@ -263,10 +471,10 @@ define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v0.l, 3 -; GFX11-TRUE16-NEXT: .LBB2_4: ; %end +; GFX11-TRUE16-NEXT: .LBB4_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l @@ -302,33 +510,138 @@ end: ret bfloat %phi } +define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i16_to_bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s7, s6, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_lshl_b32 s4, s6, 16 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_i16_to_bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_i16_to_bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-TRUE16-LABEL: bitcast_i16_to_bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s0, 3 +; GFX11-TRUE16-NEXT: .LBB5_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB5_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB5_2 +; +; GFX11-FAKE16-LABEL: bitcast_i16_to_bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB5_3: +; GFX11-FAKE16-NEXT: .LBB5_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i16 %a, 3 + %a2 = bitcast i16 %a1 to bfloat + br label %end + +cmp.false: + %a3 = bitcast i16 %a to bfloat + br label %end + +end: + %phi = phi bfloat [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret bfloat %phi +} + define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { -; GCN-LABEL: bitcast_bf16_to_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_bf16_to_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_i16: ; VI: ; %bb.0: @@ -386,7 +699,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB3_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -397,7 +710,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: .LBB3_4: ; %end +; GFX11-TRUE16-NEXT: .LBB6_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h @@ -411,7 +724,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -424,7 +737,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: .LBB3_2: ; %end +; GFX11-FAKE16-NEXT: .LBB6_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -444,34 +757,180 @@ end: ret i16 %phi } +define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_bf16_to_i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_bf16_to_i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_4 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_3: +; VI-NEXT: s_branch .LBB7_2 +; VI-NEXT: .LBB7_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_bf16_to_i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_bf16_to_i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-TRUE16-NEXT: .LBB7_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB7_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB7_2 +; GFX11-TRUE16-NEXT: .LBB7_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_bf16_to_i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-FAKE16-NEXT: .LBB7_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB7_3: +; GFX11-FAKE16-NEXT: s_branch .LBB7_2 +; GFX11-FAKE16-NEXT: .LBB7_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd bfloat %a, 0xR40C0 + %a2 = bitcast bfloat %a1 to i16 + br label %end + +cmp.false: + %a3 = bitcast bfloat %a to i16 + br label %end + +end: + %phi = phi i16 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i16 %phi +} + define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { -; GCN-LABEL: bitcast_f16_to_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f16_to_bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_bf16: ; VI: ; %bb.0: @@ -512,10 +971,10 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x200, v0.l -; GFX11-TRUE16-NEXT: .LBB4_4: ; %end +; GFX11-TRUE16-NEXT: .LBB8_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l @@ -551,35 +1010,144 @@ end: ret bfloat %phi } +define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f16_to_bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_f16_to_bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_4 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_3: +; VI-NEXT: s_branch .LBB9_2 +; VI-NEXT: .LBB9_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f16_to_bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_add_f16_e32 v0, s16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_branch .LBB9_2 +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f16_to_bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-TRUE16-NEXT: .LBB9_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, 0x200, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB9_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB9_2 +; GFX11-TRUE16-NEXT: .LBB9_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f16_to_bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-FAKE16-NEXT: .LBB9_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, 0x200, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB9_3: +; GFX11-FAKE16-NEXT: s_branch .LBB9_2 +; GFX11-FAKE16-NEXT: .LBB9_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd half %a, 0xH0200 + %a2 = bitcast half %a1 to bfloat + br label %end + +cmp.false: + %a3 = bitcast half %a to bfloat + br label %end + +end: + %phi = phi bfloat [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret bfloat %phi +} + define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { -; GCN-LABEL: bitcast_bf16_to_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_bf16_to_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_f16: ; VI: ; %bb.0: @@ -637,7 +1205,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB5_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -648,7 +1216,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: .LBB5_4: ; %end +; GFX11-TRUE16-NEXT: .LBB10_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h @@ -662,7 +1230,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -675,7 +1243,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: .LBB5_2: ; %end +; GFX11-FAKE16-NEXT: .LBB10_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -690,6 +1258,154 @@ cmp.false: %a3 = bitcast bfloat %a to half br label %end +end: + %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret half %phi +} + +define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_bf16_to_f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_bf16_to_f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_bf16_to_f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_bf16_to_f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-TRUE16-NEXT: .LBB11_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB11_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB11_2 +; GFX11-TRUE16-NEXT: .LBB11_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_bf16_to_f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-FAKE16-NEXT: .LBB11_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB11_3: +; GFX11-FAKE16-NEXT: s_branch .LBB11_2 +; GFX11-FAKE16-NEXT: .LBB11_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd bfloat %a, 0xR40C0 + %a2 = bitcast bfloat %a1 to half + br label %end + +cmp.false: + %a3 = bitcast bfloat %a to half + br label %end + end: %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret half %phi diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index 32d21e19e8e01..c366836520a82 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v6f32: ; VI: ; %bb.0: @@ -96,25 +96,141 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v6i32_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v6i32: ; VI: ; %bb.0: @@ -184,25 +300,146 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v3i64: ; VI: ; %bb.0: @@ -275,25 +512,141 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v6i32_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v6i32: ; VI: ; %bb.0: @@ -368,25 +721,141 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v3i64_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v3f64: ; VI: ; %bb.0: @@ -459,22 +928,138 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v6i32_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v6i32: ; VI: ; %bb.0: @@ -483,12 +1068,12 @@ define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -499,12 +1084,12 @@ define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -516,12 +1101,12 @@ define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -541,52 +1126,161 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12i16: ; VI: ; %bb.0: @@ -659,85 +1353,229 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v6i32_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v6i32: ; VI: ; %bb.0: @@ -746,7 +1584,7 @@ define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -767,7 +1605,7 @@ define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -824,92 +1662,279 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v12i16_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NEXT: v_mov_b32_e32 v12, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12f16: ; VI: ; %bb.0: @@ -982,125 +2007,284 @@ end: ret <12 x half> %phi } -define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB17_2 ; -; VI-LABEL: bitcast_v12f16_to_v6i32: +; VI-LABEL: bitcast_v6i32_to_v12f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v6, 0x200 -; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v7 +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + +define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v12f16_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f16_to_v6i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v6, 0x200 +; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v7 ; VI-NEXT: v_add_f16_sdwa v7, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 ; VI-NEXT: v_or_b32_e32 v4, v4, v7 @@ -1116,7 +2300,7 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1174,25 +2358,232 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v12f16_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v3i64: ; VI: ; %bb.0: @@ -1262,25 +2653,146 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_3: +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v6f32: ; VI: ; %bb.0: @@ -1355,25 +2867,141 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v3i64_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v3f64: ; VI: ; %bb.0: @@ -1443,22 +3071,143 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v6f32: ; VI: ; %bb.0: @@ -1467,12 +3216,12 @@ define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1483,12 +3232,12 @@ define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1500,12 +3249,12 @@ define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1525,52 +3274,161 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_4 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_3: +; SI-NEXT: s_branch .LBB27_2 +; SI-NEXT: .LBB27_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_4 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_3: +; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12i16: ; VI: ; %bb.0: @@ -1640,85 +3498,236 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v6f32: ; VI: ; %bb.0: @@ -1727,7 +3736,7 @@ define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -1748,7 +3757,7 @@ define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1805,92 +3814,279 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v12i16_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NEXT: v_mov_b32_e32 v12, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12f16: ; VI: ; %bb.0: @@ -1960,111 +4156,279 @@ end: ret <12 x half> %phi } +define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v6f32_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v6f32: ; VI: ; %bb.0: @@ -2073,7 +4437,7 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2094,7 +4458,7 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2152,25 +4516,232 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v12f16_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v3f64: ; VI: ; %bb.0: @@ -2245,22 +4816,137 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v3i64_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: s_branch .LBB37_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v3i64: ; VI: ; %bb.0: @@ -2269,12 +4955,12 @@ define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2285,12 +4971,12 @@ define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2302,12 +4988,12 @@ define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2327,52 +5013,161 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_4 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_3: +; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: .LBB39_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12i16: ; VI: ; %bb.0: @@ -2447,85 +5242,229 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v3i64_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB41_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v3i64: ; VI: ; %bb.0: @@ -2534,7 +5473,7 @@ define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -2555,7 +5494,7 @@ define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2612,92 +5551,279 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v12i16_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v5 -; GCN-NEXT: v_mov_b32_e32 v13, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v3 -; GCN-NEXT: v_mov_b32_e32 v15, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v12, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v13, vcc +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12f16: ; VI: ; %bb.0: @@ -2735,26 +5861,185 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v3i64_to_v12f16: +; GFX11-LABEL: bitcast_v3i64_to_v12f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + +define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v3i64_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v12f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB45_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2773,110 +6058,110 @@ end: } define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3i64: ; VI: ; %bb.0: @@ -2885,7 +6170,7 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2906,7 +6191,7 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2964,55 +6249,262 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v12f16_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v5 -; GCN-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v16 -; GCN-NEXT: v_mov_b32_e32 v2, v17 -; GCN-NEXT: v_mov_b32_e32 v4, v14 -; GCN-NEXT: v_mov_b32_e32 v6, v15 -; GCN-NEXT: v_mov_b32_e32 v8, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v6, v15 +; SI-NEXT: v_mov_b32_e32 v8, v12 +; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12i16: ; VI: ; %bb.0: @@ -3021,12 +6513,12 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3037,12 +6529,12 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3054,12 +6546,12 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3079,85 +6571,231 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v6, v15 +; SI-NEXT: v_mov_b32_e32 v8, v12 +; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v3f64: ; VI: ; %bb.0: @@ -3166,7 +6804,7 @@ define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -3187,7 +6825,7 @@ define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3244,82 +6882,269 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v12i16_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v0 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v17 -; GCN-NEXT: v_mov_b32_e32 v2, v13 -; GCN-NEXT: v_mov_b32_e32 v3, v16 -; GCN-NEXT: v_mov_b32_e32 v4, v14 -; GCN-NEXT: v_mov_b32_e32 v5, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v15 +; SI-NEXT: v_mov_b32_e32 v1, v17 +; SI-NEXT: v_mov_b32_e32 v2, v13 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v5, v14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12f16: ; VI: ; %bb.0: @@ -3328,12 +7153,12 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3344,12 +7169,12 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3361,12 +7186,12 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3386,111 +7211,267 @@ end: ret <12 x half> %phi } +define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_add_f64 v[12:13], s[16:17], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v3f64_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: +; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: .LBB53_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3f64: ; VI: ; %bb.0: @@ -3499,7 +7480,7 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3520,7 +7501,7 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3578,98 +7559,306 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v12f16_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v11 -; GCN-NEXT: v_mov_b32_e32 v22, v10 -; GCN-NEXT: v_mov_b32_e32 v21, v9 -; GCN-NEXT: v_mov_b32_e32 v20, v8 -; GCN-NEXT: v_mov_b32_e32 v19, v7 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_4 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB28_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: .LBB28_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v11 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v19, v7 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_4 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB56_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: .LBB56_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v12f16: ; VI: ; %bb.0: @@ -3678,7 +7867,7 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_add_u16_sdwa v7, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3699,7 +7888,7 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v9 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3756,82 +7945,265 @@ end: ret <12 x half> %phi } +define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v12i16_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v2, v2, v14 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v12i16: ; VI: ; %bb.0: @@ -3840,7 +8212,7 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v0 @@ -3861,7 +8233,7 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v9, v2 ; VI-NEXT: v_or_b32_e32 v1, v8, v1 ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3918,3 +8290,209 @@ end: %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <12 x i16> %phi } + +define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v12f16_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_e32 v6, s16, v0 +; VI-NEXT: v_add_f16_sdwa v7, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v8, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v9, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v10, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v11, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v0 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_or_b32_e32 v3, v3, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v1, v1, v8 +; VI-NEXT: v_or_b32_e32 v0, v6, v7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index 3ec705baa9c82..48070b75804f5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -1,30 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i32_to_v7f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i32_to_v7f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v7f32: ; VI: ; %bb.0: @@ -101,26 +101,150 @@ end: ret <7 x float> %phi } +define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i32_to_v7f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v7i32_to_v7f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v7i32_to_v7f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v7i32_to_v7f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_mov_b32_e32 v6, s18 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f32_to_v7i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f32_to_v7i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v7i32: ; VI: ; %bb.0: @@ -193,60 +317,191 @@ end: ret <7 x i32> %phi } +define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f32_to_v7i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f32_to_v7i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v7i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v7i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} + define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i32_to_v14i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i32_to_v14i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14i16: ; VI: ; %bb.0: @@ -294,7 +549,7 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 @@ -303,7 +558,7 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -323,95 +578,252 @@ end: ret <14 x i16> %phi } +define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i32_to_v14i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v7i32_to_v14i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v7i32_to_v14i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v7i32_to_v14i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_mov_b32_e32 v6, s18 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <14 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <14 x i16> + br label %end + +end: + %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i16> %phi +} + define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i16_to_v7i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v0, v0, v14 -; GCN-NEXT: v_or_b32_e32 v1, v1, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v19, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i16_to_v7i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v7i32: ; VI: ; %bb.0: @@ -420,7 +832,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 3 ; VI-NEXT: v_add_u16_e32 v7, 3, v6 @@ -444,7 +856,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v7, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -475,7 +887,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] @@ -484,7 +896,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -504,103 +916,311 @@ end: ret <7 x i32> %phi } +define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i16_to_v7i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v14i16_to_v7i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v14i16_to_v7i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i16_to_v7i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i16> %a, splat (i16 3) + %a2 = bitcast <14 x i16> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x i16> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} + define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i32_to_v14f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v1 -; GCN-NEXT: v_mov_b32_e32 v14, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i32_to_v14f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14f16: ; VI: ; %bb.0: @@ -648,7 +1268,7 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 @@ -657,7 +1277,7 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -677,125 +1297,299 @@ end: ret <14 x half> %phi } +define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i32_to_v14f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v7i32_to_v14f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v7i32_to_v14f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v7i32_to_v14f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_mov_b32_e32 v6, s18 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <14 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <14 x half> + br label %end + +end: + %phi = phi <14 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x half> %phi +} + define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f16_to_v7i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v12 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v20, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v15, v3 -; GCN-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-NEXT: v_or_b32_e32 v5, v8, v5 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v10, v6 -; GCN-NEXT: v_or_b32_e32 v4, v13, v12 -; GCN-NEXT: v_or_b32_e32 v5, v8, v11 -; GCN-NEXT: v_or_b32_e32 v6, v7, v9 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f16_to_v7i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7i32: ; VI: ; %bb.0: @@ -804,7 +1598,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_sdwa v8, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -828,7 +1622,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -860,7 +1654,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] @@ -869,7 +1663,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -889,60 +1683,292 @@ end: ret <7 x i32> %phi } +define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f16_to_v7i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v14f16_to_v7i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v7, v1 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_add_f16_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f16_to_v7i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f16_to_v7i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x half> %a, splat (half 0xH0200) + %a2 = bitcast <14 x half> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x half> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} + define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f32_to_v14i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f32_to_v14i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14i16: ; VI: ; %bb.0: @@ -1015,95 +2041,256 @@ end: ret <14 x i16> %phi } +define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f32_to_v14i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f32_to_v14i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v14i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v14i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <14 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <14 x i16> + br label %end + +end: + %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i16> %phi +} + define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i16_to_v7f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v0, v0, v14 -; GCN-NEXT: v_or_b32_e32 v1, v1, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v19, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i16_to_v7f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v7f32: ; VI: ; %bb.0: @@ -1112,7 +2299,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 3 ; VI-NEXT: v_add_u16_e32 v7, 3, v6 @@ -1136,7 +2323,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v7, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,7 +2354,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] @@ -1176,7 +2363,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1196,103 +2383,311 @@ end: ret <7 x float> %phi } +define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i16_to_v7f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v14i16_to_v7f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v14i16_to_v7f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i16_to_v7f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i16> %a, splat (i16 3) + %a2 = bitcast <14 x i16> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x i16> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f32_to_v14f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v1 -; GCN-NEXT: v_mov_b32_e32 v14, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f32_to_v14f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14f16: ; VI: ; %bb.0: @@ -1365,125 +2760,306 @@ end: ret <14 x half> %phi } +define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f32_to_v14f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v7f32_to_v14f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v14f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v14f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <14 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <14 x half> + br label %end + +end: + %phi = phi <14 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x half> %phi +} + define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f16_to_v7f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v12 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v20, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v15, v3 -; GCN-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-NEXT: v_or_b32_e32 v5, v8, v5 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v10, v6 -; GCN-NEXT: v_or_b32_e32 v4, v13, v12 -; GCN-NEXT: v_or_b32_e32 v5, v8, v11 -; GCN-NEXT: v_or_b32_e32 v6, v7, v9 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f16_to_v7f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7f32: ; VI: ; %bb.0: @@ -1492,7 +3068,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_sdwa v8, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1516,7 +3092,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1548,7 +3124,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] @@ -1557,7 +3133,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1577,110 +3153,342 @@ end: ret <7 x float> %phi } +define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f16_to_v7f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v14f16_to_v7f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v7, v1 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_add_f16_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f16_to_v7f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f16_to_v7f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x half> %a, splat (half 0xH0200) + %a2 = bitcast <14 x half> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x half> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i16_to_v14f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v27, v13 -; GCN-NEXT: v_mov_b32_e32 v26, v12 -; GCN-NEXT: v_mov_b32_e32 v25, v11 -; GCN-NEXT: v_mov_b32_e32 v24, v10 -; GCN-NEXT: v_mov_b32_e32 v23, v9 -; GCN-NEXT: v_mov_b32_e32 v22, v8 -; GCN-NEXT: v_mov_b32_e32 v21, v7 -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i16_to_v14f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v14f16: ; VI: ; %bb.0: @@ -1689,7 +3497,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_sdwa v8, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1713,7 +3521,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v10 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1744,7 +3552,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] @@ -1753,7 +3561,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1773,92 +3581,293 @@ end: ret <14 x half> %phi } +define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i16_to_v14f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v14i16_to_v14f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v14i16_to_v14f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i16_to_v14f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i16> %a, splat (i16 3) + %a2 = bitcast <14 x i16> %a1 to <14 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x i16> %a to <14 x half> + br label %end + +end: + %phi = phi <14 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x half> %phi +} + define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f16_to_v14i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v14 -; GCN-NEXT: v_or_b32_e32 v10, v10, v15 -; GCN-NEXT: v_or_b32_e32 v6, v6, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f16_to_v14i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v14i16: ; VI: ; %bb.0: @@ -1867,7 +3876,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v0 @@ -1891,7 +3900,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v10, v2 ; VI-NEXT: v_or_b32_e32 v1, v9, v1 ; VI-NEXT: v_or_b32_e32 v0, v7, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1923,7 +3932,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] @@ -1932,7 +3941,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1951,3 +3960,229 @@ end: %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <14 x i16> %phi } + +define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v14f16_to_v14i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v7, s16, v0 +; VI-NEXT: v_add_f16_sdwa v8, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v9, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v11, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v12, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v13, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v0 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_or_b32_e32 v2, v2, v10 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v0, v7, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f16_to_v14i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f16_to_v14i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x half> %a, splat (half 0xH0200) + %a2 = bitcast <14 x half> %a1 to <14 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x half> %a to <14 x i16> + br label %end + +end: + %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index cc32c19b267bf..e46df60a93343 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1,32 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v8f32: ; VI: ; %bb.0: @@ -106,27 +105,158 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v8i32_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v8i32: ; VI: ; %bb.0: @@ -201,27 +331,165 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v4i64: ; VI: ; %bb.0: @@ -271,7 +539,7 @@ define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -281,7 +549,7 @@ define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -301,27 +569,158 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v8i32_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v8i32: ; VI: ; %bb.0: @@ -371,7 +770,7 @@ define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -383,7 +782,7 @@ define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -403,27 +802,158 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v4i64_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v4f64: ; VI: ; %bb.0: @@ -473,7 +1003,7 @@ define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -483,7 +1013,7 @@ define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -503,23 +1033,154 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v8i32_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v8i32: ; VI: ; %bb.0: @@ -528,13 +1189,13 @@ define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -545,13 +1206,13 @@ define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -563,13 +1224,13 @@ define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -589,62 +1250,184 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16i16: ; VI: ; %bb.0: @@ -694,7 +1477,7 @@ define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -704,7 +1487,7 @@ define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -724,104 +1507,272 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v8i32_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v8i32: ; VI: ; %bb.0: @@ -830,7 +1781,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 3 ; VI-NEXT: v_add_u16_e32 v8, 3, v7 @@ -857,7 +1808,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -889,7 +1840,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -899,7 +1850,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -919,130 +1870,363 @@ end: ret <8 x i32> %phi } -define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v8i32_to_v16f16: +; VI-LABEL: bitcast_v16i16_to_v8i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v8i32_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: ; %bb.2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1076,7 +2260,7 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -1086,7 +2270,7 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1106,139 +2290,327 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v8i32_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8i32: ; VI: ; %bb.0: @@ -1247,7 +2619,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1274,7 +2646,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,7 +2679,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -1317,7 +2689,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1337,98 +2709,352 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v16f16_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16bf16: ; VI: ; %bb.0: @@ -1478,7 +3104,7 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -1488,7 +3114,7 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1508,123 +3134,311 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s29 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v4, s25 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v8i32: ; VI: ; %bb.0: @@ -1633,7 +3447,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -1780,7 +3594,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1791,7 +3605,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -1915,7 +3729,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1927,7 +3741,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2071,7 +3885,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2083,7 +3897,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -2214,7 +4028,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2234,222 +4048,875 @@ end: ret <8 x i32> %phi } -define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v8i32_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v8i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB12_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB12_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v8i32_to_v32i8: +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v8i32_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i32_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v33, v5 @@ -2483,7 +4950,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -2509,9 +4976,9 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 @@ -2545,7 +5012,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -2585,7 +5052,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -2603,9 +5070,9 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 @@ -2631,7 +5098,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB24_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -2689,7 +5156,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -2715,9 +5182,9 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 3, v39 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 3, v37 @@ -2751,7 +5218,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -2779,228 +5246,856 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v29, s9 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v8i32_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: v_mov_b32_e32 v2, s58 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s57 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s46 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s44 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s41 +; VI-NEXT: v_mov_b32_e32 v18, s40 +; VI-NEXT: v_mov_b32_e32 v19, s6 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v21, s29 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v29, s24 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s58 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s57 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s45 +; GFX9-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s44 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 +; GFX9-NEXT: v_mov_b32_e32 v21, s29 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v27, s4 +; GFX9-NEXT: v_mov_b32_e32 v28, s23 +; GFX9-NEXT: v_mov_b32_e32 v29, s24 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_v8i32_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_v8i32_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s29 :: v_dual_mov_b32 v11, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v15, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s15 :: v_dual_mov_b32 v27, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s19 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v8i32: ; VI: ; %bb.0: @@ -3033,14 +6128,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: s_cbranch_execnz .LBB26_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_4 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB26_4 +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3098,8 +6193,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: .LBB26_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3191,14 +6286,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: s_cbranch_execnz .LBB26_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_4 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3256,8 +6351,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: .LBB26_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3357,14 +6452,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -3456,8 +6551,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -3582,14 +6677,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -3679,8 +6774,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -3788,27 +6883,968 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v32i8_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_mov_b32_e32 v24, v4 +; VI-NEXT: v_mov_b32_e32 v21, v2 +; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v8 +; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v8i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB27_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB27_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB27_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v8i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB27_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB27_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v4i64: ; VI: ; %bb.0: @@ -3883,27 +7919,165 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v8f32: ; VI: ; %bb.0: @@ -3953,7 +8127,7 @@ define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3965,7 +8139,7 @@ define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3985,27 +8159,158 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v4i64_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v4f64: ; VI: ; %bb.0: @@ -4080,23 +8385,161 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v8f32: ; VI: ; %bb.0: @@ -4105,13 +8548,13 @@ define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4122,13 +8565,13 @@ define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4140,13 +8583,13 @@ define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4166,62 +8609,184 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_4 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_3: +; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16i16: ; VI: ; %bb.0: @@ -4296,117 +8861,288 @@ end: ret <16 x i16> %phi } -define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v16i16_to_v8f32: +; VI-LABEL: bitcast_v8f32_to_v16i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_e32 v8, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v8, v7 ; VI-NEXT: v_add_u16_e32 v8, 3, v6 ; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4429,7 +9165,7 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4461,7 +9197,7 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -4471,7 +9207,7 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4491,114 +9227,347 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v16i16_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16f16: ; VI: ; %bb.0: @@ -4673,139 +9642,333 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v8f32_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8f32: ; VI: ; %bb.0: @@ -4814,7 +9977,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4841,7 +10004,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4874,7 +10037,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -4884,7 +10047,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4904,98 +10067,352 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v16f16_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16bf16: ; VI: ; %bb.0: @@ -5070,123 +10487,318 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v0, s29 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v4, s25 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v8f32: ; VI: ; %bb.0: @@ -5195,7 +10807,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -5342,7 +10954,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5353,7 +10965,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -5477,7 +11089,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5489,7 +11101,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5633,7 +11245,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5645,7 +11257,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -5776,7 +11388,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5796,179 +11408,832 @@ end: ret <8 x float> %phi } -define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB47_2 ; -; VI-LABEL: bitcast_v8f32_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v8f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB24_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v8f32_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 ; VI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 @@ -5997,7 +12262,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -6045,7 +12310,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -6071,9 +12336,9 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 @@ -6107,7 +12372,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -6147,7 +12412,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -6165,9 +12430,9 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v32, 1.0, v32 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 1.0, v33 :: v_dual_add_f32 v10, 1.0, v10 @@ -6191,7 +12456,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -6249,7 +12514,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -6275,9 +12540,9 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v39, 1.0, v39 :: v_dual_add_f32 v32, 1.0, v32 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 1.0, v37 :: v_dual_add_f32 v34, 1.0, v34 @@ -6309,7 +12574,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -6337,268 +12602,937 @@ end: ret <32 x i8> %phi } -define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s15, s23, 24 +; SI-NEXT: s_lshr_b32 s24, s23, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s9, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s22, 1.0 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v21, s14 +; SI-NEXT: v_mov_b32_e32 v22, s13 +; SI-NEXT: v_mov_b32_e32 v23, s12 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v30, s24 +; SI-NEXT: v_mov_b32_e32 v31, s15 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v32i8_to_v8f32: +; VI-LABEL: bitcast_v8f32_to_v32i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v4 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s25, s23, 8 +; VI-NEXT: s_lshr_b32 s24, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s40, s21, 8 +; VI-NEXT: s_lshr_b32 s29, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s45, s19, 8 +; VI-NEXT: s_lshr_b32 s44, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: s_lshr_b32 s57, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v25, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v24, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v35, s59 +; VI-NEXT: v_mov_b32_e32 v2, s57 +; VI-NEXT: v_mov_b32_e32 v5, s58 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v34, s46 +; VI-NEXT: v_mov_b32_e32 v10, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v21, s40 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v32, s26 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v12, v9 +; VI-NEXT: v_mov_b32_e32 v20, v17 +; VI-NEXT: v_mov_b32_e32 v28, v25 +; VI-NEXT: v_mov_b32_e32 v1, v35 +; VI-NEXT: v_mov_b32_e32 v9, v34 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s23, 8 +; GFX9-NEXT: s_lshr_b32 s24, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 8 +; GFX9-NEXT: s_lshr_b32 s29, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s45, s19, 8 +; GFX9-NEXT: s_lshr_b32 s44, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v17, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v16, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v25, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v24, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s57 +; GFX9-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s44 +; GFX9-NEXT: v_mov_b32_e32 v13, s45 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8f32_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8f32_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v39, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v37, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v35, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v36, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v38, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v2, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v6, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s42 :: v_dual_mov_b32 v10, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s41 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v18, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v22, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v26, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v30, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v32i8_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: s_cbranch_execnz .LBB50_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_4 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB50_4 +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_3: ; %cmp.false +; VI-NEXT: .LBB50_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6656,8 +13590,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: .LBB50_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6749,14 +13683,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: s_cbranch_execnz .LBB50_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_4 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB50_4 +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6814,8 +13748,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: .LBB50_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6915,14 +13849,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -7014,8 +13948,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -7140,14 +14074,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -7237,8 +14171,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -7346,27 +14280,968 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v32i8_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_mov_b32_e32 v24, v4 +; VI-NEXT: v_mov_b32_e32 v21, v2 +; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v8 +; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v8f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v8f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v4f64: ; VI: ; %bb.0: @@ -7416,7 +15291,7 @@ define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7428,7 +15303,7 @@ define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7448,23 +15323,153 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v4i64_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v4i64: ; VI: ; %bb.0: @@ -7473,13 +15478,13 @@ define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7490,13 +15495,13 @@ define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7508,13 +15513,13 @@ define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7534,62 +15539,184 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_3: +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16i16: ; VI: ; %bb.0: @@ -7639,7 +15766,7 @@ define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7651,7 +15778,7 @@ define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7671,117 +15798,285 @@ end: ret <16 x i16> %phi } -define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB57_2 ; -; VI-LABEL: bitcast_v16i16_to_v4i64: +; VI-LABEL: bitcast_v4i64_to_v16i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_e32 v8, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB58_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v8, v7 ; VI-NEXT: v_add_u16_e32 v8, 3, v6 ; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7804,7 +16099,7 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7836,7 +16131,7 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -7846,7 +16141,7 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7866,114 +16161,347 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v16i16_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v7 -; GCN-NEXT: v_mov_b32_e32 v17, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v22, v3 -; GCN-NEXT: v_mov_b32_e32 v21, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v18, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16f16: ; VI: ; %bb.0: @@ -8023,7 +16551,7 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8035,7 +16563,7 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: .LBB60_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8055,139 +16583,327 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v4i64_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4i64: ; VI: ; %bb.0: @@ -8196,7 +16912,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -8223,7 +16939,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8256,7 +16972,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -8266,7 +16982,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: .LBB62_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8286,98 +17002,352 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v16f16_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16bf16: ; VI: ; %bb.0: @@ -8427,7 +17397,7 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8439,7 +17409,7 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8459,123 +17429,311 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s15, s19, 0 +; SI-NEXT: s_add_u32 s13, s20, 3 +; SI-NEXT: s_addc_u32 s11, s21, 0 +; SI-NEXT: s_add_u32 s9, s22, 3 +; SI-NEXT: s_addc_u32 s7, s23, 0 +; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s10, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s12, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s24, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s16, 16 +; SI-NEXT: s_and_b32 s26, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s5, 16 +; SI-NEXT: s_and_b32 s28, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s29 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v4, s25 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v4i64: ; VI: ; %bb.0: @@ -8584,7 +17742,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -8731,7 +17889,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8742,7 +17900,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -8866,7 +18024,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8878,7 +18036,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9022,7 +18180,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9034,7 +18192,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -9165,7 +18323,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9185,194 +18343,847 @@ end: ret <4 x i64> %phi } -define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB67_2 ; -; VI-LABEL: bitcast_v4i64_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v4i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB34_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 -; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v4i64_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB68_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB68_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB68_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 @@ -9386,7 +19197,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -9434,7 +19245,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -9460,9 +19271,9 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -9496,7 +19307,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -9536,7 +19347,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -9554,9 +19365,9 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9585,7 +19396,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_4: ; %end +; GFX11-TRUE16-NEXT: .LBB68_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -9643,7 +19454,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -9669,9 +19480,9 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB68_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v36, vcc_lo, v36, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9708,7 +19519,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -9736,228 +19547,856 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v29, s9 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v4i64_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: v_mov_b32_e32 v2, s58 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s57 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s46 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s44 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s41 +; VI-NEXT: v_mov_b32_e32 v18, s40 +; VI-NEXT: v_mov_b32_e32 v19, s6 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v21, s29 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v29, s24 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s58 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s57 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s45 +; GFX9-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s44 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 +; GFX9-NEXT: v_mov_b32_e32 v21, s29 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v27, s4 +; GFX9-NEXT: v_mov_b32_e32 v28, s23 +; GFX9-NEXT: v_mov_b32_e32 v29, s24 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v4i64_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v4i64_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s29 :: v_dual_mov_b32 v11, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v15, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s15 :: v_dual_mov_b32 v27, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s19 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v4i64: ; VI: ; %bb.0: @@ -9990,14 +20429,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10055,8 +20494,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10148,14 +20587,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10213,8 +20652,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10314,14 +20753,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -10413,8 +20852,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -10539,14 +20978,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -10636,8 +21075,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -10745,66 +21184,1007 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v32i8_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_mov_b32_e32 v24, v4 +; VI-NEXT: v_mov_b32_e32 v21, v2 +; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v8 +; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v4i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-TRUE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB71_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB71_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB71_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v4i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-FAKE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB71_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB71_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v16, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v22, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v22 -; GCN-NEXT: v_mov_b32_e32 v2, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v20 -; GCN-NEXT: v_mov_b32_e32 v6, v21 -; GCN-NEXT: v_mov_b32_e32 v8, v18 -; GCN-NEXT: v_mov_b32_e32 v10, v19 -; GCN-NEXT: v_mov_b32_e32 v12, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v6, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v19 +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16i16: ; VI: ; %bb.0: @@ -10813,13 +22193,13 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10830,13 +22210,13 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10848,13 +22228,13 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: .LBB72_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -10874,104 +22254,268 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB73_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: s_branch .LBB73_5 +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: .LBB73_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v6, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v19 +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i16_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v4f64: ; VI: ; %bb.0: @@ -10980,7 +22524,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 3 ; VI-NEXT: v_add_u16_e32 v8, 3, v7 @@ -11007,7 +22551,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: .LBB74_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11039,7 +22583,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -11049,7 +22593,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB37_2: ; %end +; GFX11-NEXT: .LBB74_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11069,102 +22613,335 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v16i16_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v19 -; GCN-NEXT: v_mov_b32_e32 v1, v23 -; GCN-NEXT: v_mov_b32_e32 v2, v16 -; GCN-NEXT: v_mov_b32_e32 v3, v22 -; GCN-NEXT: v_mov_b32_e32 v4, v17 -; GCN-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v20 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v2, v21 +; SI-NEXT: v_mov_b32_e32 v3, v20 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v5, v17 +; SI-NEXT: v_mov_b32_e32 v6, v18 +; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16f16: ; VI: ; %bb.0: @@ -11173,13 +22950,13 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11190,13 +22967,13 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11208,13 +22985,13 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: .LBB76_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11234,139 +23011,317 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v4f64_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4f64: ; VI: ; %bb.0: @@ -11375,7 +23330,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -11402,7 +23357,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11435,7 +23390,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -11445,7 +23400,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB39_2: ; %end +; GFX11-NEXT: .LBB78_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11465,86 +23420,340 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v16f16_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_4 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_3: +; VI-NEXT: s_branch .LBB79_2 +; VI-NEXT: .LBB79_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_4 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_3: +; GFX9-NEXT: s_branch .LBB79_2 +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v23 -; GCN-NEXT: v_mov_b32_e32 v1, v22 -; GCN-NEXT: v_mov_b32_e32 v2, v21 -; GCN-NEXT: v_mov_b32_e32 v3, v20 -; GCN-NEXT: v_mov_b32_e32 v4, v19 -; GCN-NEXT: v_mov_b32_e32 v5, v18 -; GCN-NEXT: v_mov_b32_e32 v6, v17 -; GCN-NEXT: v_mov_b32_e32 v7, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: .LBB80_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v23 +; SI-NEXT: v_mov_b32_e32 v1, v22 +; SI-NEXT: v_mov_b32_e32 v2, v21 +; SI-NEXT: v_mov_b32_e32 v3, v20 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v5, v18 +; SI-NEXT: v_mov_b32_e32 v6, v17 +; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16bf16: ; VI: ; %bb.0: @@ -11553,13 +23762,13 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11570,13 +23779,13 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11588,13 +23797,13 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11614,123 +23823,302 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s29, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s22, 16 +; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s21, 16 +; SI-NEXT: s_and_b32 s15, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s20, 16 +; SI-NEXT: s_and_b32 s13, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s19, 16 +; SI-NEXT: s_and_b32 s11, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s18, 16 +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[20:21], 1.0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v4f64: ; VI: ; %bb.0: @@ -11739,7 +24127,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -11886,7 +24274,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11897,7 +24285,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -12021,7 +24409,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12033,7 +24421,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12177,7 +24565,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12189,7 +24577,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -12320,7 +24708,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12340,133 +24728,786 @@ end: ret <4 x double> %phi } -define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v5 -; GCN-NEXT: v_mov_b32_e32 v34, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v7, v6, 24 -; GCN-NEXT: v_alignbit_b32 v26, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v25, v7, v6, 8 -; GCN-NEXT: v_alignbit_b32 v19, v35, v34, 24 -; GCN-NEXT: v_alignbit_b32 v18, v35, v34, 16 -; GCN-NEXT: v_alignbit_b32 v17, v35, v34, 8 -; GCN-NEXT: v_alignbit_b32 v11, v3, v2, 24 -; GCN-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; GCN-NEXT: v_alignbit_b32 v38, v1, v0, 24 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v33, v1, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_alignbit_b32 v27, v7, v6, 24 -; GCN-NEXT: v_alignbit_b32 v26, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v25, v7, v6, 8 -; GCN-NEXT: v_alignbit_b32 v19, v35, v34, 24 -; GCN-NEXT: v_alignbit_b32 v18, v35, v34, 16 -; GCN-NEXT: v_alignbit_b32 v17, v35, v34, 8 -; GCN-NEXT: v_alignbit_b32 v11, v3, v2, 24 -; GCN-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; GCN-NEXT: v_alignbit_b32 v38, v1, v0, 24 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v33, v1, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v34 -; GCN-NEXT: v_mov_b32_e32 v20, v35 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v1, v33 -; GCN-NEXT: v_mov_b32_e32 v2, v32 -; GCN-NEXT: v_mov_b32_e32 v3, v38 -; GCN-NEXT: v_mov_b32_e32 v6, v37 -; GCN-NEXT: v_mov_b32_e32 v7, v36 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB83_2 ; -; VI-LABEL: bitcast_v4f64_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v4f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_3: +; GFX11-NEXT: s_branch .LBB83_2 +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v4f64_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v5 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v7, v6, 24 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; SI-NEXT: v_alignbit_b32 v19, v35, v34, 24 +; SI-NEXT: v_alignbit_b32 v18, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v17, v35, v34, 8 +; SI-NEXT: v_alignbit_b32 v11, v3, v2, 24 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 +; SI-NEXT: v_alignbit_b32 v38, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v33, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v7, v6, 24 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; SI-NEXT: v_alignbit_b32 v19, v35, v34, 24 +; SI-NEXT: v_alignbit_b32 v18, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v17, v35, v34, 8 +; SI-NEXT: v_alignbit_b32 v11, v3, v2, 24 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 +; SI-NEXT: v_alignbit_b32 v38, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v33, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v20, v35 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v38 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v7, v36 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 @@ -12481,7 +25522,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -12507,9 +25548,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 @@ -12539,7 +25580,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -12587,7 +25628,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -12613,9 +25654,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 @@ -12645,7 +25686,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -12685,7 +25726,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -12703,9 +25744,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 @@ -12729,7 +25770,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_4: ; %end +; GFX11-TRUE16-NEXT: .LBB84_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -12787,7 +25828,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -12813,9 +25854,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB84_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 @@ -12847,7 +25888,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -12875,228 +25916,889 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v32, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v33, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v34, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v35, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s25, s23, 24 +; SI-NEXT: s_lshr_b32 s24, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 8 +; SI-NEXT: s_lshr_b32 s14, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 8 +; SI-NEXT: s_lshr_b32 s8, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v25, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: s_branch .LBB85_5 +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v23, s14 +; SI-NEXT: v_mov_b32_e32 v22, s13 +; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v31, s25 +; SI-NEXT: v_mov_b32_e32 v30, s24 +; SI-NEXT: v_mov_b32_e32 v29, s15 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v17, v33 +; SI-NEXT: v_mov_b32_e32 v25, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s43, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s41, s23, 8 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s44, s22, 8 +; VI-NEXT: s_lshr_b32 s40, s21, 24 +; VI-NEXT: s_lshr_b32 s29, s21, 16 +; VI-NEXT: s_lshr_b32 s28, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: s_lshr_b32 s27, s19, 24 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: s_lshr_b32 s25, s19, 8 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 8 +; VI-NEXT: s_lshr_b32 s24, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s59 +; VI-NEXT: v_mov_b32_e32 v35, s58 +; VI-NEXT: v_mov_b32_e32 v10, s57 +; VI-NEXT: v_mov_b32_e32 v34, s56 +; VI-NEXT: v_mov_b32_e32 v18, s47 +; VI-NEXT: v_mov_b32_e32 v33, s46 +; VI-NEXT: v_mov_b32_e32 v26, s45 +; VI-NEXT: v_mov_b32_e32 v32, s44 +; VI-NEXT: v_mov_b32_e32 v31, s43 +; VI-NEXT: v_mov_b32_e32 v30, s42 +; VI-NEXT: v_mov_b32_e32 v29, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v22, s29 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v12, v9 +; VI-NEXT: v_mov_b32_e32 v20, v17 +; VI-NEXT: v_mov_b32_e32 v28, v25 +; VI-NEXT: v_mov_b32_e32 v1, v35 +; VI-NEXT: v_mov_b32_e32 v9, v34 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s43, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s41, s23, 8 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s22, 8 +; GFX9-NEXT: s_lshr_b32 s40, s21, 24 +; GFX9-NEXT: s_lshr_b32 s29, s21, 16 +; GFX9-NEXT: s_lshr_b32 s28, s21, 8 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: s_lshr_b32 s27, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_lshr_b32 s25, s19, 8 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s18, 8 +; GFX9-NEXT: s_lshr_b32 s24, s17, 24 +; GFX9-NEXT: s_lshr_b32 s15, s17, 16 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s59 +; GFX9-NEXT: v_mov_b32_e32 v35, s58 +; GFX9-NEXT: v_mov_b32_e32 v10, s57 +; GFX9-NEXT: v_mov_b32_e32 v34, s56 +; GFX9-NEXT: v_mov_b32_e32 v18, s47 +; GFX9-NEXT: v_mov_b32_e32 v33, s46 +; GFX9-NEXT: v_mov_b32_e32 v26, s45 +; GFX9-NEXT: v_mov_b32_e32 v32, s44 +; GFX9-NEXT: v_mov_b32_e32 v31, s43 +; GFX9-NEXT: v_mov_b32_e32 v30, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s41 +; GFX9-NEXT: v_mov_b32_e32 v23, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4f64_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s28 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4f64_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[34:35], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[36:37], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[38:39], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s46 :: v_dual_mov_b32 v1, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v9, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s42 :: v_dual_mov_b32 v17, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s40 :: v_dual_mov_b32 v25, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v30, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v22, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s8 :: v_dual_mov_b32 v14, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s10 :: v_dual_mov_b32 v6, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v21, s23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s13 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_4 -; GCN-NEXT: .LBB43_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB43_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: .LBB43_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_4 +; SI-NEXT: .LBB86_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB86_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: .LBB86_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v4f64: ; VI: ; %bb.0: @@ -13129,14 +26831,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: s_cbranch_execnz .LBB86_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB86_4 +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13194,8 +26896,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 -; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB86_2 +; VI-NEXT: .LBB86_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -13287,14 +26989,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: s_cbranch_execnz .LBB86_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB86_4 +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13352,8 +27054,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 -; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB86_2 +; GFX9-NEXT: .LBB86_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -13453,14 +27155,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -13552,8 +27254,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -13678,14 +27380,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -13775,8 +27477,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -13884,139 +27586,1081 @@ end: ret <4 x double> %phi } -define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v31, v15 -; GCN-NEXT: v_mov_b32_e32 v30, v14 -; GCN-NEXT: v_mov_b32_e32 v29, v13 -; GCN-NEXT: v_mov_b32_e32 v28, v12 -; GCN-NEXT: v_mov_b32_e32 v27, v11 -; GCN-NEXT: v_mov_b32_e32 v26, v10 -; GCN-NEXT: v_mov_b32_e32 v25, v9 -; GCN-NEXT: v_mov_b32_e32 v24, v8 -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_4 -; GCN-NEXT: .LBB44_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB44_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: .LBB44_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB87_2 ; -; VI-LABEL: bitcast_v16i16_to_v16f16: +; VI-LABEL: bitcast_v32i8_to_v4f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v8, 3 -; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v11, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v12, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v13, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v23, v5 +; VI-NEXT: v_mov_b32_e32 v24, v4 +; VI-NEXT: v_mov_b32_e32 v21, v2 +; VI-NEXT: v_mov_b32_e32 v20, v1 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; VI-NEXT: v_or_b32_sdwa v0, v21, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; VI-NEXT: v_or_b32_sdwa v2, v22, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v19 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v21 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v8 +; VI-NEXT: v_or_b32_sdwa v8, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v4f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-TRUE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB87_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB87_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB87_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v4f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-FAKE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB87_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB87_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_mov_b32_e32 v28, v12 +; SI-NEXT: v_mov_b32_e32 v27, v11 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v25, v9 +; SI-NEXT: v_mov_b32_e32 v24, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_4 +; SI-NEXT: .LBB88_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB88_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: .LBB88_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB88_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 3 +; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v11, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v15, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v8, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v7, 3, v7 @@ -14035,7 +28679,7 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v11 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: .LBB88_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14067,7 +28711,7 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: s_cbranch_execz .LBB88_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -14077,7 +28721,7 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB44_2: ; %end +; GFX11-NEXT: .LBB88_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14097,103 +28741,323 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v16i16_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_4 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_3: +; GFX9-NEXT: s_branch .LBB89_2 +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v14, v14, v16 -; GCN-NEXT: v_or_b32_e32 v10, v10, v17 -; GCN-NEXT: v_or_b32_e32 v6, v6, v18 -; GCN-NEXT: v_or_b32_e32 v2, v2, v19 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16i16: ; VI: ; %bb.0: @@ -14202,7 +29066,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v0 @@ -14229,7 +29093,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v11, v2 ; VI-NEXT: v_or_b32_e32 v1, v10, v1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: .LBB90_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14262,7 +29126,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -14272,7 +29136,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB45_2: ; %end +; GFX11-NEXT: .LBB90_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14292,115 +29156,359 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v16f16_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v13, s5 +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v5, v6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_e32 v8, s16, v0 +; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v10, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v11, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s21, v0 +; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_3: +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v14 -; GCN-NEXT: v_mov_b32_e32 v22, v12 -; GCN-NEXT: v_mov_b32_e32 v21, v10 -; GCN-NEXT: v_mov_b32_e32 v20, v8 -; GCN-NEXT: v_mov_b32_e32 v19, v6 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v2 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v15, v0 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v4, v11, v4 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_or_b32_e32 v5, v5, v10 -; GCN-NEXT: v_or_b32_e32 v3, v3, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i16_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v14 +; SI-NEXT: v_mov_b32_e32 v22, v12 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16bf16: ; VI: ; %bb.0: @@ -14409,7 +29517,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 3 ; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -14436,7 +29544,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v11 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: .LBB92_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14468,7 +29576,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -14478,7 +29586,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB46_2: ; %end +; GFX11-NEXT: .LBB92_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14498,146 +29606,400 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s10, s20, 16 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_lshl_b32 s12, s22, 16 +; SI-NEXT: s_lshl_b32 s13, s23, 16 +; SI-NEXT: s_lshl_b32 s14, s24, 16 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_lshl_b32 s40, s26, 16 +; SI-NEXT: s_lshl_b32 s41, s27, 16 +; SI-NEXT: s_lshl_b32 s42, s28, 16 +; SI-NEXT: s_lshl_b32 s43, s29, 16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s27, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s14, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s12, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s10, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s41, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s5, 16 +; SI-NEXT: s_and_b32 s43, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s4, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s40 +; SI-NEXT: v_mov_b32_e32 v11, s41 +; SI-NEXT: v_mov_b32_e32 v12, s42 +; SI-NEXT: v_mov_b32_e32 v13, s43 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_4 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_3: +; GFX9-NEXT: s_branch .LBB93_2 +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_4 -; GCN-NEXT: .LBB47_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB47_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: .LBB47_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v0, v14, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v19, v2, 16 -; GCN-NEXT: v_alignbit_b32 v8, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v12, v21, v5, 16 -; GCN-NEXT: v_alignbit_b32 v14, v15, v17, 16 -; GCN-NEXT: v_alignbit_b32 v10, v11, v9, 16 -; GCN-NEXT: v_alignbit_b32 v6, v7, v18, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v13, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v23, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v22, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_4 +; SI-NEXT: .LBB94_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16i16: ; VI: ; %bb.0: @@ -14646,7 +30008,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -14793,7 +30155,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14804,7 +30166,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -14928,7 +30290,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v10, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v9, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14940,7 +30302,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 @@ -15093,7 +30455,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v10 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v11, 16, v9 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v12, 16, v8 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15105,7 +30467,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -15240,7 +30602,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v11, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15260,277 +30622,915 @@ end: ret <16 x i16> %phi } -define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v48, v15 -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v49, v11 -; GCN-NEXT: v_mov_b32_e32 v33, v10 -; GCN-NEXT: v_mov_b32_e32 v36, v8 -; GCN-NEXT: v_mov_b32_e32 v50, v7 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v38, v4 -; GCN-NEXT: v_mov_b32_e32 v51, v3 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v39, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v48 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v48 -; GCN-NEXT: v_bfe_u32 v7, v51, 8, 8 -; GCN-NEXT: v_bfe_u32 v15, v50, 8, 8 -; GCN-NEXT: v_bfe_u32 v23, v49, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v52 -; GCN-NEXT: v_or_b32_e32 v4, v1, v53 -; GCN-NEXT: v_or_b32_e32 v8, v2, v54 -; GCN-NEXT: v_or_b32_e32 v12, v3, v55 -; GCN-NEXT: v_or_b32_e32 v16, v5, v40 -; GCN-NEXT: v_or_b32_e32 v20, v9, v41 -; GCN-NEXT: v_or_b32_e32 v24, v10, v42 -; GCN-NEXT: v_or_b32_e32 v28, v11, v43 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_bfe_u32 v31, v48, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: .LBB48_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v43, v1 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_or_b32_e32 v3, v41, v3 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: v_or_b32_e32 v5, v55, v5 -; GCN-NEXT: v_or_b32_e32 v6, v52, v6 -; GCN-NEXT: v_or_b32_e32 v7, v53, v7 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v7 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: .LBB48_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v1 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB95_2 ; -; VI-LABEL: bitcast_v16i16_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v16, v4 -; VI-NEXT: v_mov_b32_e32 v49, v5 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: .LBB48_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v1 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_lshl_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc +; VI-NEXT: s_lshl_b32 s5, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s5, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 +; VI-NEXT: v_add_f32_e32 v12, s4, v1 +; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 +; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_lshl_b32 s5, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_lshl_b32 s5, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 +; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v7 +; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_or_b32 v7, v6, v13, v7 +; GFX9-NEXT: v_and_or_b32 v6, v4, v13, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v5 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_or_b32 v5, v4, v13, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v4 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc +; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v4, v4, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GFX9-NEXT: v_and_or_b32 v3, v3, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-NEXT: v_and_or_b32 v2, v2, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_and_or_b32 v1, v9, v13, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v13, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: s_and_b32 s8, s1, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s8 +; GFX11-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s2, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: s_and_b32 s1, s5, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v8, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v5, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v7 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v10, v4 +; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v3, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v12, v5 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s4, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v12, v7 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: s_lshl_b32 s0, s5, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v14, v10 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v5, v12, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v15, v11 +; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v13, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v7, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s6, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v5 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s7, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v14, v18 :: v_dual_add_nc_u32 v14, 0x7fff, v16 +; GFX11-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v7 +; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v14, v18 :: v_dual_add_nc_u32 v14, v20, v17 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v20, v22, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_bfe_u32 v18, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v23, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v11, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v6, v15, vcc_lo +; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v17 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v13, v10 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v8, v12 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v9, v14 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB95_3: +; GFX11-NEXT: s_branch .LBB95_2 +; GFX11-NEXT: .LBB95_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v3 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v48 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v8, v5, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v5, v54 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v5, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v20, v5, v40 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v24, v5, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: v_or_b32_e32 v4, v1, v52 +; SI-NEXT: v_or_b32_e32 v28, v5, v42 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v48 +; SI-NEXT: v_bfe_u32 v7, v51, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v50, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v49, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v48, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: .LBB96_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: .LBB96_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB96_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v16, v4 +; VI-NEXT: v_mov_b32_e32 v49, v5 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: .LBB96_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB96_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -15576,7 +31576,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v39, v36, 8, 8 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v50 ; VI-NEXT: v_mov_b32_e32 v1, v38 @@ -15625,7 +31625,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -15651,9 +31651,9 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -15687,7 +31687,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -15727,7 +31727,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -15745,9 +31745,9 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] @@ -15773,7 +31773,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %end +; GFX11-TRUE16-NEXT: .LBB96_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -15831,7 +31831,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -15857,9 +31857,9 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, v39, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0] @@ -15893,7 +31893,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -15921,330 +31921,1084 @@ end: ret <32 x i8> %phi } -define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v25 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v1, v1, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v0, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_or_b32_e32 v14, v14, v54 -; GCN-NEXT: v_or_b32_e32 v15, v15, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v34, v5 -; GCN-NEXT: v_or_b32_e32 v6, v35, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v9, v36, v9 -; GCN-NEXT: v_or_b32_e32 v10, v37, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v12, v39, v12 -; GCN-NEXT: v_or_b32_e32 v13, v38, v13 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_or_b32_e32 v23, v4, v5 -; GCN-NEXT: v_or_b32_e32 v27, v7, v9 -; GCN-NEXT: v_or_b32_e32 v31, v11, v12 -; GCN-NEXT: v_or_b32_e32 v17, v0, v2 -; GCN-NEXT: v_or_b32_e32 v21, v8, v6 -; GCN-NEXT: v_or_b32_e32 v25, v14, v10 -; GCN-NEXT: v_or_b32_e32 v29, v15, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NEXT: v_alignbit_b32 v1, v19, v2, 16 -; GCN-NEXT: v_alignbit_b32 v5, v23, v6, 16 -; GCN-NEXT: v_alignbit_b32 v9, v27, v10, 16 -; GCN-NEXT: v_alignbit_b32 v13, v31, v13, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v53, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v9, v54, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v13, v50, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v52, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v49, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: v_or_b32_e32 v3, v38, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_or_b32_e32 v7, v39, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_or_b32_e32 v11, v37, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_or_b32_e32 v15, v36, v15 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v10, v35, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v14, v34, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v5, v11, v9 -; GCN-NEXT: v_or_b32_e32 v7, v15, v13 -; GCN-NEXT: v_or_b32_e32 v8, v10, v8 -; GCN-NEXT: v_or_b32_e32 v9, v14, v12 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v2 -; GCN-NEXT: v_alignbit_b32 v1, v19, v17, 16 -; GCN-NEXT: v_alignbit_b32 v5, v23, v21, 16 -; GCN-NEXT: v_alignbit_b32 v9, v27, v25, 16 -; GCN-NEXT: v_alignbit_b32 v13, v31, v29, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v17 -; GCN-NEXT: v_mov_b32_e32 v2, v19 -; GCN-NEXT: v_mov_b32_e32 v4, v21 -; GCN-NEXT: v_mov_b32_e32 v6, v23 -; GCN-NEXT: v_mov_b32_e32 v8, v25 -; GCN-NEXT: v_mov_b32_e32 v10, v27 -; GCN-NEXT: v_mov_b32_e32 v12, v29 -; GCN-NEXT: v_mov_b32_e32 v14, v31 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_alignbit_b32 v11, s9, v6, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v6, 8 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_alignbit_b32 v19, s10, v6, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, s10, v6, 8 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_or_b32_e32 v28, v6, v5 +; SI-NEXT: v_alignbit_b32 v3, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s12, v1, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, s8, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, s8, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, s8, 8 +; SI-NEXT: s_lshr_b32 s44, s12, 8 +; SI-NEXT: s_lshr_b32 s14, s9, 8 +; SI-NEXT: s_lshr_b32 s41, s10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_and_b32 s42, s27, 0xffff +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v4 +; SI-NEXT: s_bfe_u32 s13, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s40, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s43, s27, 0x80008 +; SI-NEXT: v_bfe_u32 v31, v4, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_alignbit_b32 v11, s9, v4, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v4, 8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_alignbit_b32 v3, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s12, v1, 8 +; SI-NEXT: v_alignbit_b32 v19, s10, v4, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, s10, v4, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v0, 8 +; SI-NEXT: s_lshr_b32 s13, s12, 24 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s12, 8 +; SI-NEXT: s_lshr_b32 s40, s9, 24 +; SI-NEXT: s_lshr_b32 s15, s9, 16 +; SI-NEXT: s_lshr_b32 s14, s9, 8 +; SI-NEXT: s_lshr_b32 s43, s10, 24 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s44 +; SI-NEXT: v_mov_b32_e32 v6, s45 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v14, s15 +; SI-NEXT: v_mov_b32_e32 v15, s40 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v23, s43 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB97_2 ; -; VI-LABEL: bitcast_v32i8_to_v16i16: +; VI-LABEL: bitcast_v16i16_to_v32i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v2 -; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_3 -; VI-NEXT: ; %bb.1: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_4 -; VI-NEXT: .LBB49_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s7, s23, 3 +; VI-NEXT: s_add_i32 s9, s20, 3 +; VI-NEXT: s_add_i32 s11, s21, 3 +; VI-NEXT: s_and_b32 s12, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s18, 3 +; VI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s19, 3 +; VI-NEXT: s_and_b32 s18, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s19, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s17, s19, s17 +; VI-NEXT: s_or_b32 s16, s18, s16 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s19, s14, 0x30000 +; VI-NEXT: s_add_i32 s18, s12, 0x30000 +; VI-NEXT: s_add_i32 s21, s10, 0x30000 +; VI-NEXT: s_add_i32 s20, s8, 0x30000 +; VI-NEXT: s_add_i32 s23, s6, 0x30000 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: v_mov_b32_e32 v2, s58 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s57 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s46 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s44 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s41 +; VI-NEXT: v_mov_b32_e32 v18, s40 +; VI-NEXT: v_mov_b32_e32 v19, s6 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v21, s29 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v29, s24 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB49_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s23, 8 +; GFX9-NEXT: s_lshr_b32 s24, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 8 +; GFX9-NEXT: s_lshr_b32 s29, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s45, s19, 8 +; GFX9-NEXT: s_lshr_b32 s44, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s57 +; GFX9-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s44 +; GFX9-NEXT: v_mov_b32_e32 v13, s45 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16i16_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16i16_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v35, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v34, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v2, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v6, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s42 :: v_dual_mov_b32 v10, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s41 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v18, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v22, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v26, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v30, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v32i8_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v34, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v4, v39, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v48, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v21, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v8, v50, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v51, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v29, v0, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v12, v53, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v15, v27, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_or_b32_e32 v14, v13, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v8, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v18, v55, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v18, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v21, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB98_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB98_4 +; VI-NEXT: .LBB98_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB98_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr18 @@ -16271,8 +33025,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: .LBB98_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v30 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -16364,14 +33118,14 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: s_cbranch_execnz .LBB98_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_4 -; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB98_4 +; GFX9-NEXT: .LBB98_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -16430,8 +33184,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: .LBB98_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 @@ -16529,14 +33283,14 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.h @@ -16602,8 +33356,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v24.l, 3 @@ -16703,14 +33457,14 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 @@ -16784,8 +33538,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v28, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 @@ -16877,154 +33631,1096 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v19, v14 +; SI-NEXT: v_mov_b32_e32 v20, v12 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v13 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s8, s19, 24 +; SI-NEXT: s_or_b32 s4, s8, s4 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s12, s29, 8 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s12, s6, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: s_or_b32 s41, s15, s12 +; SI-NEXT: s_and_b32 s12, s26, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s15, s27, 24 +; SI-NEXT: s_or_b32 s12, s15, s12 +; SI-NEXT: s_and_b32 s15, s16, 0xff +; SI-NEXT: s_lshl_b32 s40, s17, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: v_or_b32_e32 v9, v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v0, v10 +; SI-NEXT: s_or_b32 s15, s15, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s40, s25, 8 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_or_b32_e32 v12, v3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 +; SI-NEXT: s_or_b32 s12, s4, s12 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s40, s9, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v15, v7, v13 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v14, v9, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v18, s4, v12 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s40, s13, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: s_or_b32 s8, s8, s41 +; SI-NEXT: v_or_b32_e32 v22, v17, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s8, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v22, 16 +; SI-NEXT: v_or_b32_e32 v12, s4, v22 +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: s_lshr_b32 s41, s41, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_alignbit_b32 v5, s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v18, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s40 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v32i8_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v23, v4 +; VI-NEXT: v_mov_b32_e32 v19, v2 +; VI-NEXT: v_mov_b32_e32 v21, v1 +; VI-NEXT: v_mov_b32_e32 v22, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v22 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v23 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v21 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v19 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v14, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v6, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB99_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB99_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB99_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB99_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB99_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_4 -; GCN-NEXT: .LBB50_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB50_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: .LBB50_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_4 +; SI-NEXT: .LBB100_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB100_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: .LBB100_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16bf16: ; VI: ; %bb.0: @@ -17033,7 +34729,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v0 @@ -17060,7 +34756,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v11, v2 ; VI-NEXT: v_or_b32_e32 v1, v10, v1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17093,7 +34789,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: s_cbranch_execz .LBB100_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -17103,7 +34799,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB50_2: ; %end +; GFX11-NEXT: .LBB100_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17123,170 +34819,439 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v13, s5 +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v5, v6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_e32 v8, s16, v0 +; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v10, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v11, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s21, v0 +; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_4 -; GCN-NEXT: .LBB51_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB51_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: .LBB51_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_4 +; SI-NEXT: .LBB102_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB102_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16f16: ; VI: ; %bb.0: @@ -17295,7 +35260,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -17442,7 +35407,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17453,7 +35418,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -17577,7 +35542,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v10, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v9, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17589,7 +35554,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v6 @@ -17733,7 +35698,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v13, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v11 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17745,7 +35710,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -17880,7 +35845,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v11, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17900,217 +35865,906 @@ end: ret <16 x half> %phi } -define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v14 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v17 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_4 -; GCN-NEXT: .LBB52_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB52_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: v_bfe_u32 v23, v22, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v33, v0 -; GCN-NEXT: v_or_b32_e32 v4, v32, v1 -; GCN-NEXT: v_or_b32_e32 v8, v35, v2 -; GCN-NEXT: v_or_b32_e32 v12, v34, v3 -; GCN-NEXT: v_or_b32_e32 v16, v38, v5 -; GCN-NEXT: v_or_b32_e32 v20, v36, v9 -; GCN-NEXT: v_or_b32_e32 v24, v49, v10 -; GCN-NEXT: v_or_b32_e32 v28, v48, v11 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_bfe_u32 v31, v30, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: .LBB52_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: v_bfe_u32 v23, v22, 8, 8 -; GCN-NEXT: v_or_b32_e32 v24, v1, v0 -; GCN-NEXT: v_or_b32_e32 v28, v2, v12 -; GCN-NEXT: v_or_b32_e32 v16, v4, v3 -; GCN-NEXT: v_or_b32_e32 v20, v5, v17 -; GCN-NEXT: v_or_b32_e32 v8, v8, v18 -; GCN-NEXT: v_or_b32_e32 v12, v9, v19 -; GCN-NEXT: v_or_b32_e32 v0, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v13, v21 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_bfe_u32 v31, v30, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v1 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB103_2 ; -; VI-LABEL: bitcast_v16f16_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v16f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v5 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_mov_b32_e32 v33, v3 -; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v1 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_lshl_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc +; VI-NEXT: s_lshl_b32 s5, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s5, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 +; VI-NEXT: v_add_f32_e32 v12, s4, v1 +; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 +; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: s_lshl_b32 s5, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 +; GFX9-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v7 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_lshl_b32 s5, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc +; GFX9-NEXT: v_add_f32_e32 v12, s5, v1 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v12 +; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v5, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v12 +; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc +; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GFX9-NEXT: v_and_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX9-NEXT: v_and_b32_sdwa v10, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: s_and_b32 s8, s2, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s4, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, v6, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_and_b32 s0, s5, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s0, s5, 16 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX11-NEXT: v_bfe_u32 v5, v12, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v13, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v12 +; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo +; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s6, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v6 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v10 :: v_dual_add_nc_u32 v10, v14, v15 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s7, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v15 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v18, v14 +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v20, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v16 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v12 +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v15, v21, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v16 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v14 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v7, v13, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v3 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v4, v17, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v1, v8, 16, v13 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB103_3: +; GFX11-NEXT: s_branch .LBB103_2 +; GFX11-NEXT: .LBB103_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v16f16_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_4 +; SI-NEXT: .LBB104_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB104_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v8, v36, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v35, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v16, v39, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v4, v32, v1 +; SI-NEXT: v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: .LBB104_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_or_b32_e32 v24, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_or_b32_e32 v28, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v8, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v33, v3 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 @@ -18133,7 +36787,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 @@ -18151,9 +36805,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_sdwa v14, v33, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -18204,7 +36858,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v37, v36, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v32 @@ -18252,7 +36906,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -18278,9 +36932,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] @@ -18315,7 +36969,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -18355,7 +37009,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -18373,9 +37027,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, v33 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] @@ -18401,7 +37055,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_4: ; %end +; GFX11-TRUE16-NEXT: .LBB104_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -18459,7 +37113,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -18485,9 +37139,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB104_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, v39 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, v37 op_sel_hi:[0,1] @@ -18521,7 +37175,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -18549,226 +37203,977 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v8, v36, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v35, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v16, v39, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v4, v32, v1 +; SI-NEXT: v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_or_b32_e32 v24, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_or_b32_e32 v28, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v8, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v16f16_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s23, 24 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s25, s23, 8 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s29, s22, 8 +; VI-NEXT: s_lshr_b32 s41, s21, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s24, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s28, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s15, s19, 8 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s27, s18, 8 +; VI-NEXT: s_lshr_b32 s43, s17, 24 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v2, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_add_f16_e32 v14, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; VI-NEXT: v_add_f16_e32 v34, s19, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v12, v34, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_add_f16_e32 v22, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v11, v8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_f16_e32 v33, s21, v1 +; VI-NEXT: v_add_f16_e32 v18, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v20, v33, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; VI-NEXT: v_add_f16_e32 v16, s20, v1 +; VI-NEXT: v_add_f16_e32 v30, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v35, s17, v1 +; VI-NEXT: v_or_b32_e32 v19, v16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; VI-NEXT: v_add_f16_e32 v32, s23, v1 +; VI-NEXT: v_add_f16_e32 v26, s4, v1 +; VI-NEXT: v_or_b32_e32 v4, v35, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 +; VI-NEXT: v_or_b32_e32 v37, v32, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; VI-NEXT: v_add_f16_e32 v24, s22, v1 +; VI-NEXT: v_or_b32_e32 v3, v0, v3 +; VI-NEXT: v_or_b32_e32 v36, v24, v5 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[36:37] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v36 +; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v2, s59 +; VI-NEXT: v_mov_b32_e32 v6, s58 +; VI-NEXT: v_mov_b32_e32 v10, s57 +; VI-NEXT: v_mov_b32_e32 v14, s56 +; VI-NEXT: v_mov_b32_e32 v18, s47 +; VI-NEXT: v_mov_b32_e32 v22, s46 +; VI-NEXT: v_mov_b32_e32 v26, s45 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v35, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v32, s23 +; VI-NEXT: v_mov_b32_e32 v31, s40 +; VI-NEXT: v_mov_b32_e32 v23, s41 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v7, s43 +; VI-NEXT: v_mov_b32_e32 v25, s29 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_mov_b32_e32 v13, s15 +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v35 +; VI-NEXT: v_mov_b32_e32 v12, v34 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v28, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s23, 8 +; GFX9-NEXT: s_lshr_b32 s24, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 8 +; GFX9-NEXT: s_lshr_b32 s29, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s45, s19, 8 +; GFX9-NEXT: s_lshr_b32 s44, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v2, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s19, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s18, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, s21, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s20, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, s23, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, s22, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s57 +; GFX9-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s44 +; GFX9-NEXT: v_mov_b32_e32 v13, s45 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16f16_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB105_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16f16_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v35, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v34, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v2, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v6, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s42 :: v_dual_mov_b32 v10, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s41 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v18, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v22, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v26, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v30, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v36 -; GCN-NEXT: v_or_b32_e32 v5, v5, v37 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v9, v49 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_or_b32_e32 v11, v11, v51 -; GCN-NEXT: v_or_b32_e32 v12, v12, v52 -; GCN-NEXT: v_or_b32_e32 v13, v13, v53 -; GCN-NEXT: v_or_b32_e32 v14, v14, v54 -; GCN-NEXT: v_or_b32_e32 v15, v15, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB53_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v3, v54, v3 -; GCN-NEXT: v_or_b32_e32 v5, v53, v5 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v9, v51, v9 -; GCN-NEXT: v_or_b32_e32 v11, v50, v11 -; GCN-NEXT: v_or_b32_e32 v13, v49, v13 -; GCN-NEXT: v_or_b32_e32 v15, v48, v15 -; GCN-NEXT: v_or_b32_e32 v14, v39, v14 -; GCN-NEXT: v_or_b32_e32 v12, v38, v12 -; GCN-NEXT: v_or_b32_e32 v10, v37, v10 -; GCN-NEXT: v_or_b32_e32 v8, v36, v8 -; GCN-NEXT: v_or_b32_e32 v6, v35, v6 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 -; GCN-NEXT: .LBB53_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v31 -; GCN-NEXT: v_mov_b32_e32 v2, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v19 -; GCN-NEXT: v_mov_b32_e32 v6, v27 -; GCN-NEXT: v_mov_b32_e32 v8, v17 -; GCN-NEXT: v_mov_b32_e32 v10, v21 -; GCN-NEXT: v_mov_b32_e32 v12, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v29 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v17 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16f16: ; VI: ; %bb.0: @@ -18801,14 +38206,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: s_cbranch_execnz .LBB106_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_4 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB106_4 +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: .LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18866,8 +38271,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 -; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB106_2 +; VI-NEXT: .LBB106_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v30 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -18959,14 +38364,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: s_cbranch_execnz .LBB106_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_4 -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB106_4 +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -19025,8 +38430,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 -; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB106_2 +; GFX9-NEXT: .LBB106_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 @@ -19124,14 +38529,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.h @@ -19197,8 +38602,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v24.l, 3 @@ -19298,14 +38703,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 @@ -19379,8 +38784,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v28, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 @@ -19472,258 +38877,1173 @@ end: ret <16 x half> %phi } -define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_4 -; GCN-NEXT: .LBB54_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB54_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v53 -; GCN-NEXT: v_alignbit_b32 v0, v0, v33, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v32, 16 -; GCN-NEXT: v_alignbit_b32 v8, v1, v36, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v35, 16 -; GCN-NEXT: v_alignbit_b32 v16, v2, v48, 16 -; GCN-NEXT: v_alignbit_b32 v20, v22, v39, 16 -; GCN-NEXT: v_alignbit_b32 v24, v3, v52, 16 -; GCN-NEXT: v_alignbit_b32 v28, v30, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: .LBB54_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v16 -; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v28, v30, v2, 16 -; GCN-NEXT: v_alignbit_b32 v16, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v20, v22, v5, 16 -; GCN-NEXT: v_alignbit_b32 v8, v17, v8, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v9, 16 -; GCN-NEXT: v_alignbit_b32 v0, v18, v11, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v13, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_readfirstlane_b32 s47, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s40, v11 +; SI-NEXT: v_readfirstlane_b32 s41, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xff +; SI-NEXT: s_lshl_b32 s11, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_lshl_b32 s13, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_and_b32 s13, s22, 0xff +; SI-NEXT: s_lshl_b32 s15, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s45, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: s_and_b32 s43, s43, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: s_and_b32 s41, s41, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 8 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s15, s20, 0xff +; SI-NEXT: s_lshl_b32 s20, s21, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s5, s44, s5 +; SI-NEXT: s_or_b32 s42, s42, s43 +; SI-NEXT: s_or_b32 s40, s40, s41 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_addk_i32 s40, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB107_2 ; -; VI-LABEL: bitcast_v16bf16_to_v32i8: +; VI-LABEL: bitcast_v32i8_to_v16f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v23, v4 +; VI-NEXT: v_mov_b32_e32 v19, v2 +; VI-NEXT: v_mov_b32_e32 v21, v1 +; VI-NEXT: v_mov_b32_e32 v22, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v22 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v23 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v21 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v19 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v14, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v6, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB107_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB107_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB107_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB107_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB107_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v16bf16_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_4 +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB108_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_alignbit_b32 v8, v5, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_alignbit_b32 v16, v5, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v34, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v38, 16 +; SI-NEXT: v_alignbit_b32 v20, v22, v50, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 +; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: .LBB108_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB108_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -19894,7 +40214,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -19942,7 +40262,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -19968,9 +40288,9 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -20126,7 +40446,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -20179,7 +40499,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[26:27] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[18:19] @@ -20209,9 +40529,9 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v27.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -20368,7 +40688,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h @@ -20427,7 +40747,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -20453,9 +40773,9 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB54_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v39 @@ -20472,160 +40792,1462 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v9, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v12, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v13, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v13, v15, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v17, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v15, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v19, v14, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v19, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v32 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v18, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v20, v13, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v9, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v14, v12, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_alignbit_b32 v8, v5, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_alignbit_b32 v16, v5, v51, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v33, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v37, 16 +; SI-NEXT: v_alignbit_b32 v20, v22, v49, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 +; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s25, s23, 8 +; VI-NEXT: s_lshr_b32 s24, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s40, s21, 8 +; VI-NEXT: s_lshr_b32 s29, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s45, s19, 8 +; VI-NEXT: s_lshr_b32 s44, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: s_lshr_b32 s57, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v2 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VI-NEXT: v_add_f32_e32 v0, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v2, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: s_branch .LBB109_5 +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v35, s59 +; VI-NEXT: v_mov_b32_e32 v2, s57 +; VI-NEXT: v_mov_b32_e32 v5, s58 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v34, s46 +; VI-NEXT: v_mov_b32_e32 v10, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v21, s40 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v32, s26 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB109_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v12, v9 +; VI-NEXT: v_mov_b32_e32 v20, v17 +; VI-NEXT: v_mov_b32_e32 v28, v25 +; VI-NEXT: v_mov_b32_e32 v1, v35 +; VI-NEXT: v_mov_b32_e32 v9, v34 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s23, 24 +; GFX9-NEXT: s_lshr_b32 s59, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s23, 8 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: s_lshr_b32 s28, s21, 24 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s21, 8 +; GFX9-NEXT: s_lshr_b32 s44, s20, 16 +; GFX9-NEXT: s_lshr_b32 s43, s20, 8 +; GFX9-NEXT: s_lshr_b32 s24, s19, 24 +; GFX9-NEXT: s_lshr_b32 s57, s19, 16 +; GFX9-NEXT: s_lshr_b32 s27, s19, 8 +; GFX9-NEXT: s_lshr_b32 s40, s18, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 8 +; GFX9-NEXT: s_lshr_b32 s14, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: s_lshr_b32 s25, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB109_4 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v1 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v22, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v9 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v7, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: v_add_f32_e32 v11, s4, v5 +; GFX9-NEXT: v_bfe_u32 v12, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v11 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v12, v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; GFX9-NEXT: s_branch .LBB109_5 +; GFX9-NEXT: .LBB109_3: +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v32, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s59 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s58 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v35, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v25, s46 +; GFX9-NEXT: v_mov_b32_e32 v31, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s45 +; GFX9-NEXT: v_mov_b32_e32 v18, s44 +; GFX9-NEXT: v_mov_b32_e32 v17, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s41 +; GFX9-NEXT: v_mov_b32_e32 v10, s40 +; GFX9-NEXT: v_mov_b32_e32 v9, s29 +; GFX9-NEXT: v_mov_b32_e32 v15, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB109_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-NEXT: v_mov_b32_e32 v12, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v28, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v12, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v14, 16, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v15, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v16, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v16, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v21, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v18 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v17, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v26, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v15, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v22, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v5, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v9, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_branch .LBB109_5 +; GFX11-TRUE16-NEXT: .LBB109_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s4 +; GFX11-TRUE16-NEXT: .LBB109_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v35.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v1, v9, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, v10, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v11 :: v_dual_add_nc_u32 v12, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v12, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v6, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_add3_u32 v12, v13, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v13, v15, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v17, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v15, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v19, v14, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v18, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v7, v12, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v15, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v19, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v13, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v32 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v20, v13, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v9, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v14, v12, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v18, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v20, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v12, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v22, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v5, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v7, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: s_branch .LBB109_5 +; GFX11-FAKE16-NEXT: .LBB109_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB109_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v32 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20645,256 +42267,257 @@ end: } define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v17 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v25 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GCN-NEXT: v_or_b32_e32 v31, v1, v0 -; GCN-NEXT: v_or_b32_e32 v35, v36, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v37, v4 -; GCN-NEXT: v_or_b32_e32 v33, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v38, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v39, v9 -; GCN-NEXT: v_or_b32_e32 v23, v11, v10 -; GCN-NEXT: v_or_b32_e32 v27, v48, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v49, v14 -; GCN-NEXT: v_or_b32_e32 v32, v16, v15 -; GCN-NEXT: v_or_b32_e32 v13, v50, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_or_b32_e32 v15, v51, v20 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v25 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v54, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; GCN-NEXT: v_or_b32_e32 v12, v53, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; GCN-NEXT: v_or_b32_e32 v4, v52, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v3, v51, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v7 -; GCN-NEXT: v_or_b32_e32 v7, v50, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 -; GCN-NEXT: v_or_b32_e32 v11, v49, v15 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v15, v48, v17 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v14, v39, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v38, v9 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v15, v13 -; GCN-NEXT: v_or_b32_e32 v10, v14, v12 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v31 -; GCN-NEXT: v_mov_b32_e32 v1, v35 -; GCN-NEXT: v_mov_b32_e32 v2, v21 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v23 -; GCN-NEXT: v_mov_b32_e32 v9, v27 -; GCN-NEXT: v_mov_b32_e32 v10, v29 -; GCN-NEXT: v_mov_b32_e32 v12, v32 -; GCN-NEXT: v_mov_b32_e32 v14, v34 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v36, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: v_or_b32_e32 v31, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v39, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v7, v49, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v17 +; SI-NEXT: v_or_b32_e32 v23, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v27, v50, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v21, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v25 +; SI-NEXT: v_or_b32_e32 v32, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v13, v52, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v55 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v15, v54, v2 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v31 +; SI-NEXT: v_mov_b32_e32 v6, v19 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v9, v27 +; SI-NEXT: v_mov_b32_e32 v10, v29 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16bf16: ; VI: ; %bb.0: @@ -20927,14 +42550,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: s_cbranch_execnz .LBB110_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB110_4 +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20992,8 +42615,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 -; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB110_2 +; VI-NEXT: .LBB110_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v30 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -21085,14 +42708,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: s_cbranch_execnz .LBB110_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB110_4 +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -21151,8 +42774,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 -; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB110_2 +; GFX9-NEXT: .LBB110_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 @@ -21250,14 +42873,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.h @@ -21323,8 +42946,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v24.l, 3 @@ -21424,14 +43047,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 @@ -21505,8 +43128,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v28, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 @@ -21597,3 +43220,941 @@ end: %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <16 x bfloat> %phi } + +define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_readfirstlane_b32 s42, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB111_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v3 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_or_b32_e32 v17, v9, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_or_b32_e32 v9, v0, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_or_b32_e32 v19, v1, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s10, 24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_or_b32_e32 v18, v13, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s44, s4, 16 +; SI-NEXT: v_or_b32_e32 v13, v5, v7 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s45, s4, 16 +; SI-NEXT: v_or_b32_e32 v15, v6, v7 +; SI-NEXT: s_cbranch_execnz .LBB111_4 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s9, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s10, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: s_add_i32 s8, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: s_branch .LBB111_5 +; SI-NEXT: .LBB111_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB111_2 +; SI-NEXT: .LBB111_4: +; SI-NEXT: v_mov_b32_e32 v10, s44 +; SI-NEXT: v_mov_b32_e32 v14, s45 +; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, v17 +; SI-NEXT: v_mov_b32_e32 v11, v19 +; SI-NEXT: v_mov_b32_e32 v12, v18 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v23, v4 +; VI-NEXT: v_mov_b32_e32 v19, v2 +; VI-NEXT: v_mov_b32_e32 v21, v1 +; VI-NEXT: v_mov_b32_e32 v22, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v2, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v22 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v23 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v11 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v17 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v21 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v19 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v14, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v6, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB111_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB111_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB111_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB111_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB111_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 86ac4354e3f17..eebfb11613d85 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1,32 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i32_to_v9f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i32_to_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v9f32: ; VI: ; %bb.0: @@ -109,28 +109,167 @@ end: ret <9 x float> %phi } +define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i32_to_v9f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v9i32_to_v9f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v9i32_to_v9f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v9i32_to_v9f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f32_to_v9i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f32_to_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v9i32: ; VI: ; %bb.0: @@ -208,70 +347,217 @@ end: ret <9 x i32> %phi } +define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f32_to_v9i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9f32_to_v9i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v9i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v9i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} + define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i32_to_v18i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i32_to_v18i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18i16: ; VI: ; %bb.0: @@ -323,7 +609,7 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 @@ -334,7 +620,7 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -354,114 +640,295 @@ end: ret <18 x i16> %phi } +define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i32_to_v18i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v9i32_to_v18i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v9i32_to_v18i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v9i32_to_v18i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <18 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <18 x i16> + br label %end + +end: + %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i16> %phi +} + define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i16_to_v9i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v8 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v4 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v25 -; GCN-NEXT: v_or_b32_e32 v1, v1, v26 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_or_b32_e32 v3, v3, v24 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v26, v1 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: v_or_b32_e32 v3, v24, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i16_to_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v9i32: ; VI: ; %bb.0: @@ -470,7 +937,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 3 ; VI-NEXT: v_add_u16_e32 v9, 3, v8 @@ -500,7 +967,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v9, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v9, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -533,7 +1000,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] @@ -544,7 +1011,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -564,125 +1031,376 @@ end: ret <9 x i32> %phi } +define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i16_to_v9i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v7, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v8, v0, v11 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v18i16_to_v9i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v18i16_to_v9i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i16_to_v9i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i16> %a, splat (i16 3) + %a2 = bitcast <18 x i16> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <18 x i16> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} + define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i32_to_v18f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i32_to_v18f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18f16: ; VI: ; %bb.0: @@ -734,7 +1452,7 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 @@ -745,7 +1463,7 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -765,153 +1483,356 @@ end: ret <18 x half> %phi } +define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i32_to_v18f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v9i32_to_v18f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v9i32_to_v18f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v9i32_to_v18f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <18 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <18 x half> + br label %end + +end: + %phi = phi <18 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x half> %phi +} + define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f16_to_v9i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v16 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v23, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v12, v6 -; GCN-NEXT: v_or_b32_e32 v7, v10, v7 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v14, v8 -; GCN-NEXT: v_or_b32_e32 v6, v12, v15 -; GCN-NEXT: v_or_b32_e32 v7, v10, v13 -; GCN-NEXT: v_or_b32_e32 v8, v9, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f16_to_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9i32: ; VI: ; %bb.0: @@ -920,7 +1841,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_sdwa v10, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -950,7 +1871,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -961,7 +1882,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] @@ -973,7 +1894,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -985,7 +1906,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] @@ -996,7 +1917,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1016,70 +1937,346 @@ end: ret <9 x i32> %phi } +define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f16_to_v9i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v18f16_to_v9i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f16_to_v9i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f16_to_v9i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x half> %a, splat (half 0xH0200) + %a2 = bitcast <18 x half> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <18 x half> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} + define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f32_to_v18i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f32_to_v18i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18i16: ; VI: ; %bb.0: @@ -1157,114 +2354,314 @@ end: ret <18 x i16> %phi } +define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f32_to_v18i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9f32_to_v18i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v18i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v18i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <18 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <18 x i16> + br label %end + +end: + %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i16> %phi +} + define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i16_to_v9f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v8 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v4 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v25 -; GCN-NEXT: v_or_b32_e32 v1, v1, v26 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_or_b32_e32 v3, v3, v24 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v26, v1 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: v_or_b32_e32 v3, v24, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i16_to_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v9f32: ; VI: ; %bb.0: @@ -1273,7 +2670,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 3 ; VI-NEXT: v_add_u16_e32 v9, 3, v8 @@ -1303,7 +2700,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v9, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v9, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1336,7 +2733,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] @@ -1347,7 +2744,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1367,125 +2764,376 @@ end: ret <9 x float> %phi } +define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i16_to_v9f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v7, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v8, v0, v11 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v18i16_to_v9f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v18i16_to_v9f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i16_to_v9f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i16> %a, splat (i16 3) + %a2 = bitcast <18 x i16> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <18 x i16> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f32_to_v18f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f32_to_v18f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18f16: ; VI: ; %bb.0: @@ -1563,153 +3211,379 @@ end: ret <18 x half> %phi } +define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f32_to_v18f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v9f32_to_v18f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v18f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v18f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <18 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <18 x half> + br label %end + +end: + %phi = phi <18 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x half> %phi +} + define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f16_to_v9f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v16 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v23, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v12, v6 -; GCN-NEXT: v_or_b32_e32 v7, v10, v7 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v14, v8 -; GCN-NEXT: v_or_b32_e32 v6, v12, v15 -; GCN-NEXT: v_or_b32_e32 v7, v10, v13 -; GCN-NEXT: v_or_b32_e32 v8, v9, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f16_to_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9f32: ; VI: ; %bb.0: @@ -1718,7 +3592,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_sdwa v10, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1748,7 +3622,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1759,7 +3633,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] @@ -1771,7 +3645,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1783,7 +3657,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] @@ -1794,7 +3668,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1814,134 +3688,410 @@ end: ret <9 x float> %phi } +define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f16_to_v9f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v18f16_to_v9f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f16_to_v9f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f16_to_v9f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x half> %a, splat (half 0xH0200) + %a2 = bitcast <18 x half> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <18 x half> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i16_to_v18f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v17 -; GCN-NEXT: v_mov_b32_e32 v34, v16 -; GCN-NEXT: v_mov_b32_e32 v33, v15 -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v31, v13 -; GCN-NEXT: v_mov_b32_e32 v30, v12 -; GCN-NEXT: v_mov_b32_e32 v29, v11 -; GCN-NEXT: v_mov_b32_e32 v28, v10 -; GCN-NEXT: v_mov_b32_e32 v27, v9 -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v36, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i16_to_v18f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v17 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v31, v13 +; SI-NEXT: v_mov_b32_e32 v30, v12 +; SI-NEXT: v_mov_b32_e32 v29, v11 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v18f16: ; VI: ; %bb.0: @@ -1950,7 +4100,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 3 ; VI-NEXT: v_add_u16_sdwa v10, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1980,7 +4130,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v12 ; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2013,7 +4163,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] @@ -2024,7 +4174,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2044,113 +4194,363 @@ end: ret <18 x half> %phi } +define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i16_to_v18f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v19 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v18i16_to_v18f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v18i16_to_v18f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i16_to_v18f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i16> %a, splat (i16 3) + %a2 = bitcast <18 x i16> %a1 to <18 x half> + br label %end + +cmp.false: + %a3 = bitcast <18 x i16> %a to <18 x half> + br label %end + +end: + %phi = phi <18 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x half> %phi +} + define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f16_to_v18i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; GCN-NEXT: v_or_b32_e32 v14, v14, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GCN-NEXT: v_or_b32_e32 v10, v10, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v6, v6, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f16_to_v18i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v18i16: ; VI: ; %bb.0: @@ -2159,7 +4559,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 0x200 ; VI-NEXT: v_add_f16_e32 v9, 0x200, v0 @@ -2189,7 +4589,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v12, v2 ; VI-NEXT: v_or_b32_e32 v1, v11, v1 ; VI-NEXT: v_or_b32_e32 v0, v9, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2200,7 +4600,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] @@ -2212,7 +4612,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2224,7 +4624,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] @@ -2235,7 +4635,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2254,3 +4654,289 @@ end: %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <18 x i16> %phi } + +define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v18f16_to_v18i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s24, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v8, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v9, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v9, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f16_to_v18i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f16_to_v18i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x half> %a, splat (half 0xH0200) + %a2 = bitcast <18 x half> %a1 to <18 x i16> + br label %end + +cmp.false: + %a3 = bitcast <18 x half> %a to <18 x i16> + br label %end + +end: + %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 6e6e62c4b05ad..1c6a2b24b1242 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i32_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v10f32: ; VI: ; %bb.0: @@ -116,29 +116,176 @@ end: ret <10 x float> %phi } +define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v10i32_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f32_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f32_to_v10i32: ; VI: ; %bb.0: @@ -147,7 +294,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 @@ -159,7 +306,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -170,7 +317,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 ; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 @@ -182,7 +329,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -220,145 +367,235 @@ end: ret <10 x i32> %phi } -define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v9 -; GCN-NEXT: v_mov_b32_e32 v28, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10i32_to_v20f16: +; VI-LABEL: bitcast_v10f32_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB4_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB4_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 @@ -370,18 +607,18 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v20f16: +; GFX9-LABEL: bitcast_v10i32_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 @@ -393,11 +630,11 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i32_to_v20f16: +; GFX11-LABEL: bitcast_v10i32_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -405,7 +642,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 ; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 @@ -417,7 +654,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -425,249 +662,396 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { cmp.true: %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <20 x half> + %a2 = bitcast <10 x i32> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <20 x half> + %a3 = bitcast <10 x i32> %a to <20 x i16> br label %end end: - %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <20 x half> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_or_b32_e32 v2, v25, v2 -; GCN-NEXT: v_or_b32_e32 v3, v23, v3 -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: v_or_b32_e32 v7, v12, v7 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v17, v16 -; GCN-NEXT: v_or_b32_e32 v6, v19, v18 -; GCN-NEXT: v_or_b32_e32 v7, v12, v15 -; GCN-NEXT: v_or_b32_e32 v8, v11, v14 -; GCN-NEXT: v_or_b32_e32 v9, v10, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v20f16_to_v10i32: +; VI-LABEL: bitcast_v10i32_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v10i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v10, 0x200 -; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v11 -; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v11 -; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v11 -; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v11 -; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v11 -; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v11 -; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v11 -; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v11 -; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v11 -; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v20f16_to_v10i32: +; GFX9-LABEL: bitcast_v20i16_to_v10i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20f16_to_v10i32: +; GFX11-LABEL: bitcast_v20i16_to_v10i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -675,31 +1059,31 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <20 x half> %a, splat (half 0xH0200) - %a2 = bitcast <20 x half> %a1 to <10 x i32> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x i32> br label %end cmp.false: - %a3 = bitcast <20 x half> %a to <10 x i32> + %a3 = bitcast <20 x i16> %a to <10 x i32> br label %end end: @@ -707,561 +1091,9335 @@ end: ret <10 x i32> %phi } -define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB4_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB4_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v35, v49, v35 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; GCN-NEXT: v_or_b32_e32 v6, v6, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v19, v32, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v38, v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v27, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v21, v28, v30 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v22, v17 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v18, v19 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v26 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_or_b32_e32 v5, v5, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v21 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v0, v13 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v10i32_to_v40i8: +; VI-LABEL: bitcast_v20i16_to_v10i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v3 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB4_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB4_4: ; %end +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v40i8: +; GFX9-LABEL: bitcast_v10i32_to_v20f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB4_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 ; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 ; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] ; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB4_4: ; %end +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v10i32_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v20f16_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v20, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v20f16_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v48, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v10i32_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s75, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s40, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s12, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s73, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s72, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s59, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s58, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s46, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s45, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s41, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s40, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s4, s6, s4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s28, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s25 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s63, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s58, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s45, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s22, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v40i8_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v44 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v9, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v7, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v28 +; VI-NEXT: v_add_u16_e32 v8, 3, v30 +; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v8, 3, v39 +; VI-NEXT: v_add_u16_e32 v10, 3, v38 +; VI-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v37 +; VI-NEXT: v_add_u16_e32 v12, 3, v36 +; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40i8_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x9 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0x9 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB14_3: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v12, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v14, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v16, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v53, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v55, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v65, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v48, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v49, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v51, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v52, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v20, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v22, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v30, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v25, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v13, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v40i8_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v31, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v29, v2 +; VI-NEXT: v_mov_b32_e32 v28, v1 +; VI-NEXT: v_mov_b32_e32 v27, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v0, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v6, v6, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v27 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v29 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v7, v7, v12 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_or_b32_sdwa v17, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v4, v4, v19 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v28, v2 +; GFX9-NEXT: v_mov_b32_e32 v27, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v26, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v4 :: v_dual_mov_b32 v24, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v0 :: v_dual_lshlrev_b32 v32, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v30, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v31, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v25 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v22, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v10i32_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v5f64_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB19_4 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_3: +; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v10i32_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v5i64_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v5i64_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20i16_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v0, v13 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v20i16_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v3 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_4 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB28_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: .LBB28_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v10f32_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v20f16_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB30_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB30_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB30_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v20, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v20f16_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_4 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_3: +; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: .LBB31_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB32_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB32_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB32_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 @@ -1297,7 +10455,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -1324,7 +10482,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -1346,27 +10504,23 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB4_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 @@ -1380,7 +10534,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB4_4: ; %end +; GFX11-TRUE16-NEXT: .LBB32_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1481,7 +10635,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -1518,7 +10672,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -1550,27 +10704,23 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB4_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 @@ -1594,7 +10744,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB4_4: ; %end +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1696,12 +10846,12 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <40 x i8> + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <40 x i8> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <40 x i8> + %a3 = bitcast <10 x float> %a to <40 x i8> br label %end end: @@ -1709,290 +10859,1508 @@ end: ret <40 x i8> %phi } -define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v55 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v14, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v22, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v26, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: .LBB5_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v49, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v50, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v52, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v27, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v29, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v53, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x300, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB5_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s28, s25, 24 +; SI-NEXT: s_lshr_b32 s29, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 8 +; SI-NEXT: s_lshr_b32 s15, s23, 24 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s9, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v31, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_alignbit_b32 v1, v16, v17, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v3, v16, v17, 8 +; SI-NEXT: v_alignbit_b32 v4, v18, v21, 24 +; SI-NEXT: v_alignbit_b32 v5, v18, v21, 16 +; SI-NEXT: v_alignbit_b32 v6, v18, v21, 8 +; SI-NEXT: v_alignbit_b32 v7, v23, v24, 24 +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v23, v24, 8 +; SI-NEXT: v_alignbit_b32 v10, v28, v29, 24 +; SI-NEXT: v_alignbit_b32 v11, v28, v29, 16 +; SI-NEXT: v_alignbit_b32 v12, v28, v29, 8 +; SI-NEXT: v_alignbit_b32 v13, v31, v34, 24 +; SI-NEXT: v_alignbit_b32 v14, v31, v34, 16 +; SI-NEXT: v_alignbit_b32 v15, v31, v34, 8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v31 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v28, s19 +; SI-NEXT: v_mov_b32_e32 v24, s20 +; SI-NEXT: v_mov_b32_e32 v23, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v48, s8 +; SI-NEXT: v_mov_b32_e32 v39, s7 +; SI-NEXT: v_mov_b32_e32 v38, s6 +; SI-NEXT: v_mov_b32_e32 v37, s11 +; SI-NEXT: v_mov_b32_e32 v36, s10 +; SI-NEXT: v_mov_b32_e32 v35, s9 +; SI-NEXT: v_mov_b32_e32 v33, s14 +; SI-NEXT: v_mov_b32_e32 v32, s13 +; SI-NEXT: v_mov_b32_e32 v30, s12 +; SI-NEXT: v_mov_b32_e32 v27, s27 +; SI-NEXT: v_mov_b32_e32 v26, s26 +; SI-NEXT: v_mov_b32_e32 v25, s15 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v15, v34, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v48 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v38 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v40i8_to_v10i32: +; VI-LABEL: bitcast_v10f32_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s29, s25, 8 +; VI-NEXT: s_lshr_b32 s28, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 8 +; VI-NEXT: s_lshr_b32 s43, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 8 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s19, 8 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s17, 8 +; VI-NEXT: s_lshr_b32 s74, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; VI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s20, 1.0 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; VI-NEXT: v_add_f32_e64 v8, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_f32_e64 v10, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s16, 1.0 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: s_branch .LBB33_5 +; VI-NEXT: .LBB33_3: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: v_mov_b32_e32 v2, s25 +; VI-NEXT: v_mov_b32_e32 v39, s76 +; VI-NEXT: v_mov_b32_e32 v48, s74 +; VI-NEXT: v_mov_b32_e32 v38, s75 +; VI-NEXT: v_mov_b32_e32 v36, s73 +; VI-NEXT: v_mov_b32_e32 v37, s72 +; VI-NEXT: v_mov_b32_e32 v35, s63 +; VI-NEXT: v_mov_b32_e32 v34, s61 +; VI-NEXT: v_mov_b32_e32 v33, s62 +; VI-NEXT: v_mov_b32_e32 v31, s60 +; VI-NEXT: v_mov_b32_e32 v32, s59 +; VI-NEXT: v_mov_b32_e32 v30, s58 +; VI-NEXT: v_mov_b32_e32 v29, s56 +; VI-NEXT: v_mov_b32_e32 v28, s57 +; VI-NEXT: v_mov_b32_e32 v26, s47 +; VI-NEXT: v_mov_b32_e32 v27, s46 +; VI-NEXT: v_mov_b32_e32 v25, s45 +; VI-NEXT: v_mov_b32_e32 v24, s43 +; VI-NEXT: v_mov_b32_e32 v23, s44 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v13, s8 +; VI-NEXT: v_mov_b32_e32 v12, s10 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: .LBB33_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; VI-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; VI-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; VI-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; VI-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s28, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 8 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s21, 8 +; GFX9-NEXT: s_lshr_b32 s56, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 8 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s17, 8 +; GFX9-NEXT: s_lshr_b32 s74, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_add_f32_e64 v6, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s20, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_add_f32_e64 v8, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e64 v10, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: s_branch .LBB33_5 +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-NEXT: v_mov_b32_e32 v39, s76 +; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s73 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s63 +; GFX9-NEXT: v_mov_b32_e32 v34, s61 +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s60 +; GFX9-NEXT: v_mov_b32_e32 v32, s59 +; GFX9-NEXT: v_mov_b32_e32 v30, s58 +; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v28, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s46 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s44 +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: .LBB33_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v15 +; GFX11-TRUE16-NEXT: s_branch .LBB33_5 +; GFX11-TRUE16-NEXT: .LBB33_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: .LBB33_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v34, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v29, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v11, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v12, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB33_5 +; GFX11-FAKE16-NEXT: .LBB33_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s20 :: v_dual_mov_b32 v2, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s63 :: v_dual_mov_b32 v39, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s62 :: v_dual_mov_b32 v37, s60 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s56 :: v_dual_mov_b32 v33, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s47 :: v_dual_mov_b32 v31, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v29, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s44 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v25, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s28 :: v_dual_mov_b32 v23, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v7, s12 +; GFX11-FAKE16-NEXT: .LBB33_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v34, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v10, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v24, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v3, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v4, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v40i8_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v44 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v10f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -2045,7 +12413,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2118,9 +12486,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB5_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 @@ -2194,7 +12562,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB5_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -2204,7 +12572,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v40i8_to_v10i32: +; GFX9-LABEL: bitcast_v40i8_to_v10f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -2257,7 +12625,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2330,9 +12698,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: .LBB5_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 @@ -2406,7 +12774,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB5_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -2416,7 +12784,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 @@ -2473,15 +12841,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB5_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-TRUE16-NEXT: .LBB5_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB5_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h @@ -2594,8 +12962,8 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB5_2 -; GFX11-TRUE16-NEXT: .LBB5_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 @@ -2710,7 +13078,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 @@ -2758,15 +13126,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB5_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-FAKE16-NEXT: .LBB5_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB5_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -2879,8 +13247,8 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB5_2 -; GFX11-FAKE16-NEXT: .LBB5_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -2999,89 +13367,1214 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { cmp.true: %a1 = add <40 x i8> %a, splat (i8 3) - %a2 = bitcast <40 x i8> %a1 to <10 x i32> + %a2 = bitcast <40 x i8> %a1 to <10 x float> br label %end cmp.false: - %a3 = bitcast <40 x i8> %a to <10 x i32> + %a3 = bitcast <40 x i8> %a to <10 x float> br label %end end: - %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i32> %phi + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v40i8_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v31, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v29, v2 +; VI-NEXT: v_mov_b32_e32 v28, v1 +; VI-NEXT: v_mov_b32_e32 v27, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v0, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v6, v6, v12 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v27 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v29 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v7, v7, v12 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_or_b32_sdwa v17, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v4, v4, v19 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v28, v2 +; GFX9-NEXT: v_mov_b32_e32 v27, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v26, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v4 :: v_dual_mov_b32 v24, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v0 :: v_dual_lshlrev_b32 v32, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v30, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v31, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v25 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v22, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi } -define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10i32_to_v5f64: +; VI-LABEL: bitcast_v10f32_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v5f64: +; GFX9-LABEL: bitcast_v10f32_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i32_to_v5f64: +; GFX11-LABEL: bitcast_v10f32_to_v5f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3089,31 +14582,25 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <5 x double> + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x double> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <5 x double> + %a3 = bitcast <10 x float> %a to <5 x double> br label %end end: @@ -3121,62 +14608,237 @@ end: ret <5 x double> %phi } -define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v5f64_to_v10i32: +; VI-LABEL: bitcast_v10f32_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v5f64_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5f64_to_v10i32: +; GFX9-LABEL: bitcast_v5f64_to_v10f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5f64_to_v10i32: +; GFX11-LABEL: bitcast_v5f64_to_v10f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3184,14 +14846,14 @@ define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3199,89 +14861,223 @@ define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { cmp.true: %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <5 x double> %a1 to <10 x i32> + %a2 = bitcast <5 x double> %a1 to <10 x float> br label %end cmp.false: - %a3 = bitcast <5 x double> %a to <10 x i32> + %a3 = bitcast <5 x double> %a to <10 x float> br label %end end: - %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i32> %phi + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi } -define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_4 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_3: +; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: .LBB39_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10i32_to_v5i64: +; VI-LABEL: bitcast_v5f64_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v5i64: +; GFX9-LABEL: bitcast_v10f32_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i32_to_v5i64: +; GFX11-LABEL: bitcast_v10f32_to_v5i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3289,31 +15085,200 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <5 x i64> + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_3: +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x i64> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <5 x i64> + %a3 = bitcast <10 x float> %a to <5 x i64> br label %end end: @@ -3321,38 +15286,38 @@ end: ret <5 x i64> %phi } -define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v5i64_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v5i64_to_v10i32: +; VI-LABEL: bitcast_v5i64_to_v10f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc @@ -3364,18 +15329,18 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5i64_to_v10i32: +; GFX9-LABEL: bitcast_v5i64_to_v10f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc @@ -3387,11 +15352,11 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5i64_to_v10i32: +; GFX11-LABEL: bitcast_v5i64_to_v10f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3399,7 +15364,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3414,7 +15379,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3422,196 +15387,374 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { cmp.true: %a1 = add <5 x i64> %a, splat (i64 3) - %a2 = bitcast <5 x i64> %a1 to <10 x i32> + %a2 = bitcast <5 x i64> %a1 to <10 x float> br label %end cmp.false: - %a3 = bitcast <5 x i64> %a to <10 x i32> + %a3 = bitcast <5 x i64> %a to <10 x float> br label %end end: - %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i32> %phi + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi } -define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v9 -; GCN-NEXT: v_mov_b32_e32 v28, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: s_branch .LBB43_2 ; -; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI-LABEL: bitcast_v5i64_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB43_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v19 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v37, v17 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v35, v15 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v33, v13 +; SI-NEXT: v_mov_b32_e32 v32, v12 +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v3 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v48, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: v_mov_b32_e32 v10, 3 +; VI-NEXT: v_add_u16_sdwa v11, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v15, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v16, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v17, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v18, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v19, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v19 +; VI-NEXT: v_or_b32_e32 v7, v7, v18 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_or_b32_e32 v5, v5, v16 +; VI-NEXT: v_or_b32_e32 v4, v4, v15 +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_e32 v1, v1, v12 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v20f16: +; GFX9-LABEL: bitcast_v20i16_to_v20f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f32_to_v20f16: +; GFX11-LABEL: bitcast_v20i16_to_v20f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3619,25 +15762,31 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <20 x half> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <20 x half> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <20 x half> + %a3 = bitcast <20 x i16> %a to <20 x half> br label %end end: @@ -3645,220 +15794,445 @@ end: ret <20 x half> %phi } -define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_or_b32_e32 v2, v25, v2 -; GCN-NEXT: v_or_b32_e32 v3, v23, v3 -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: v_or_b32_e32 v7, v12, v7 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v17, v16 -; GCN-NEXT: v_or_b32_e32 v6, v19, v18 -; GCN-NEXT: v_or_b32_e32 v7, v12, v15 -; GCN-NEXT: v_or_b32_e32 v8, v11, v14 -; GCN-NEXT: v_or_b32_e32 v9, v10, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB45_2 ; -; VI-LABEL: bitcast_v20f16_to_v10f32: +; VI-LABEL: bitcast_v20i16_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v20f16_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v10, 0x200 -; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v11, 0x200 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v11, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v11 -; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v11 -; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v11 -; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v11 -; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v11 -; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v11 -; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v11 -; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v11 -; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v11 -; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_or_b32_e32 v7, v18, v7 +; VI-NEXT: v_or_b32_e32 v6, v17, v6 +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_or_b32_e32 v4, v15, v4 +; VI-NEXT: v_or_b32_e32 v3, v14, v3 +; VI-NEXT: v_or_b32_e32 v2, v13, v2 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v20f16_to_v10f32: +; GFX9-LABEL: bitcast_v20f16_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -3871,11 +16245,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20f16_to_v10f32: +; GFX11-LABEL: bitcast_v20f16_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3883,7 +16257,7 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -3895,7 +16269,7 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3903,432 +16277,927 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { cmp.true: %a1 = fadd <20 x half> %a, splat (half 0xH0200) - %a2 = bitcast <20 x half> %a1 to <10 x float> + %a2 = bitcast <20 x half> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <20 x half> %a to <10 x float> + %a3 = bitcast <20 x half> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v35, v49, v35 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; GCN-NEXT: v_or_b32_e32 v6, v6, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v19, v32, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v38, v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v27, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v21, v28, v30 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v22, v17 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v18, v19 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v26 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_or_b32_e32 v5, v5, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v21 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: s_branch .LBB47_2 ; -; VI-LABEL: bitcast_v10f32_to_v40i8: +; VI-LABEL: bitcast_v20f16_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s25, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v9, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v11, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v10, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v10, v11 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v30, v1, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v31, v1, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v23, v1, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v22, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v21, v1, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v18, v1, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v1, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v1, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v6, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v2, v1, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 +; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 +; SI-NEXT: v_alignbit_b32 v36, v22, v23, 24 +; SI-NEXT: v_alignbit_b32 v37, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 8 +; SI-NEXT: v_alignbit_b32 v33, v18, v21, 24 +; SI-NEXT: v_alignbit_b32 v34, v18, v21, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v21, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v29, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v14, 8 +; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_and_b32_e32 v45, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v42, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v55, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v51, 0xffff, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v46, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v43, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v53, v20, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v60, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 +; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 +; SI-NEXT: v_alignbit_b32 v36, v22, v23, 24 +; SI-NEXT: v_alignbit_b32 v37, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 8 +; SI-NEXT: v_alignbit_b32 v33, v18, v21, 24 +; SI-NEXT: v_alignbit_b32 v34, v18, v21, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v21, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v29, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v14, 8 +; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v40i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v55, v7 +; VI-NEXT: v_mov_b32_e32 v53, v8 +; VI-NEXT: v_mov_b32_e32 v43, v9 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_sdwa v17, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v42, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_add_u16_e32 v43, 3, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_add_u16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v53, 3, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; VI-NEXT: v_add_u16_e32 v55, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; VI-NEXT: v_or_b32_e32 v10, v42, v10 +; VI-NEXT: v_or_b32_e32 v9, v43, v9 +; VI-NEXT: v_add_u16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v49, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_add_u16_e32 v50, 3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; VI-NEXT: v_or_b32_e32 v8, v53, v8 +; VI-NEXT: v_or_b32_e32 v7, v55, v7 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_u16_e32 v37, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; VI-NEXT: v_add_u16_e32 v38, 3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; VI-NEXT: v_or_b32_e32 v6, v49, v6 +; VI-NEXT: v_or_b32_e32 v5, v50, v5 ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; VI-NEXT: v_add_u16_e32 v34, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_or_b32_e32 v4, v37, v4 +; VI-NEXT: v_or_b32_e32 v3, v38, v3 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_or_b32_e32 v1, v34, v1 ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1 +; VI-NEXT: v_bfe_u32 v29, v17, 8, 8 +; VI-NEXT: v_bfe_u32 v33, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v39, v19, 8, 8 +; VI-NEXT: v_bfe_u32 v52, v21, 8, 8 +; VI-NEXT: v_bfe_u32 v41, v23, 8, 8 +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v40i8: +; GFX9-LABEL: bitcast_v20i16_to_v40i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 @@ -4364,7 +17233,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -4396,23 +17265,23 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 @@ -4440,7 +17309,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -4505,7 +17374,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8: +; GFX11-TRUE16-LABEL: bitcast_v20i16_to_v40i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -4532,7 +17401,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -4554,23 +17423,27 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 @@ -4584,7 +17457,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -4685,7 +17558,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: +; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -4722,7 +17595,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -4754,23 +17627,27 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 @@ -4794,7 +17671,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -4872,36 +17749,1354 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s72, v6 +; SI-NEXT: v_readfirstlane_b32 s73, v5 +; SI-NEXT: v_readfirstlane_b32 s62, v2 +; SI-NEXT: v_readfirstlane_b32 s63, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_alignbit_b32 v7, s14, v1, 24 +; SI-NEXT: v_alignbit_b32 v12, s14, v1, 16 +; SI-NEXT: v_alignbit_b32 v16, s14, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_alignbit_b32 v8, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v13, s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v17, s12, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: v_alignbit_b32 v6, s10, v1, 24 +; SI-NEXT: v_alignbit_b32 v11, s10, v1, 16 +; SI-NEXT: v_alignbit_b32 v15, s10, v1, 8 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v5, s8, v1, 24 +; SI-NEXT: v_alignbit_b32 v9, s8, v1, 16 +; SI-NEXT: v_alignbit_b32 v14, s8, v1, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: v_alignbit_b32 v2, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v10, s6, v1, 8 +; SI-NEXT: s_lshr_b32 s59, s14, 8 +; SI-NEXT: s_lshr_b32 s56, s12, 8 +; SI-NEXT: s_lshr_b32 s45, s10, 8 +; SI-NEXT: s_lshr_b32 s42, s8, 8 +; SI-NEXT: s_lshr_b32 s15, s6, 8 +; SI-NEXT: s_and_b32 s60, s19, 0xffff +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_and_b32 s46, s27, 0xffff +; SI-NEXT: s_and_b32 s43, s62, 0xffff +; SI-NEXT: s_and_b32 s40, s72, 0xffff +; SI-NEXT: s_bfe_u32 s61, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s58, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s47, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s44, s62, 0x80008 +; SI-NEXT: s_bfe_u32 s41, s72, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_alignbit_b32 v7, s14, v2, 24 +; SI-NEXT: v_alignbit_b32 v12, s14, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, s14, v2, 8 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_alignbit_b32 v8, s12, v2, 24 +; SI-NEXT: v_alignbit_b32 v13, s12, v2, 16 +; SI-NEXT: v_alignbit_b32 v17, s12, v2, 8 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_alignbit_b32 v6, s10, v2, 24 +; SI-NEXT: v_alignbit_b32 v11, s10, v2, 16 +; SI-NEXT: v_alignbit_b32 v15, s10, v2, 8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_alignbit_b32 v5, s8, v2, 24 +; SI-NEXT: v_alignbit_b32 v9, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, s8, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v3, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v1, 8 +; SI-NEXT: s_lshr_b32 s61, s14, 24 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s14, 8 +; SI-NEXT: s_lshr_b32 s58, s12, 24 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s12, 8 +; SI-NEXT: s_lshr_b32 s47, s10, 24 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s10, 8 +; SI-NEXT: s_lshr_b32 s44, s8, 24 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s8, 8 +; SI-NEXT: s_lshr_b32 s41, s6, 24 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s6, 8 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s13, s61, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s13, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s58, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s47, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v9 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s44, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s40, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s41, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v20i16_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s7, s25, 3 +; VI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s22, 3 +; VI-NEXT: s_and_b32 s10, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s23, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s20, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s21, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s22, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s23, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s17, s23, s17 +; VI-NEXT: s_or_b32 s16, s22, s16 +; VI-NEXT: s_or_b32 s19, s21, s19 +; VI-NEXT: s_or_b32 s18, s20, s18 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s23, s10, 0x30000 +; VI-NEXT: s_add_i32 s22, s8, 0x30000 +; VI-NEXT: s_add_i32 s25, s6, 0x30000 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s75, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s40, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s28, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 8 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s21, 8 +; GFX9-NEXT: s_lshr_b32 s56, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 8 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s17, 8 +; GFX9-NEXT: s_lshr_b32 s74, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_pk_add_u16 v6, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_pk_add_u16 v8, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v10, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-NEXT: v_mov_b32_e32 v39, s76 +; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s73 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s63 +; GFX9-NEXT: v_mov_b32_e32 v34, s61 +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s60 +; GFX9-NEXT: v_mov_b32_e32 v32, s59 +; GFX9-NEXT: v_mov_b32_e32 v30, s58 +; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v28, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s46 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s44 +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v20i16_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v32, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v30, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s20 :: v_dual_mov_b32 v2, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s63 :: v_dual_mov_b32 v39, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s62 :: v_dual_mov_b32 v37, s60 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s56 :: v_dual_mov_b32 v33, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s47 :: v_dual_mov_b32 v31, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v29, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s44 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v25, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s28 :: v_dual_mov_b32 v23, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v7, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v34, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v10, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v24, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v3, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v4, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <40 x i8> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <40 x i8> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <40 x i8> + %a3 = bitcast <20 x i16> %a to <40 x i8> br label %end end: @@ -4909,383 +19104,460 @@ end: ret <40 x i8> %phi } -define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v55 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v14, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v22, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v26, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: .LBB13_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v49, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v50, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v52, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v27, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v29, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v53, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x300, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB13_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v40i8_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v31, v14 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v38, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v29 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v2, v49, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v6, v50, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v32, v4, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v25, v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v11, v53, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v7, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v12, v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v15, v47, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v12, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v34, v0, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v16, v16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v19, v56, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_or_b32_e32 v18, v16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v12, v0, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v35, v2, 16 +; SI-NEXT: v_alignbit_b32 v5, v32, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v7, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v55 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v35, v25, 16 +; SI-NEXT: v_alignbit_b32 v5, v32, v21, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v25 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v6, v32 +; SI-NEXT: v_mov_b32_e32 v8, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v40i8_to_v10f32: +; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, v8 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v38, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v37, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v31, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr20 @@ -5294,758 +19566,702 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB13_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u16_e32 v0, 3, v31 -; VI-NEXT: v_add_u16_e32 v1, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v9, 0x300 -; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 3, v33 -; VI-NEXT: v_add_u16_e32 v2, 3, v34 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 -; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_add_u16_e32 v2, 3, v35 -; VI-NEXT: v_add_u16_e32 v3, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v12 -; VI-NEXT: v_add_u16_e32 v4, 3, v14 -; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 -; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v16 -; VI-NEXT: v_add_u16_e32 v5, 3, v18 -; VI-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 -; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v20 -; VI-NEXT: v_add_u16_e32 v6, 3, v22 -; VI-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 -; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v24 -; VI-NEXT: v_add_u16_e32 v7, 3, v26 -; VI-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v28 -; VI-NEXT: v_add_u16_e32 v8, 3, v30 -; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 -; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: .LBB50_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v1, 0x300 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v0, 3, v54 +; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v0, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v8, 3, v39 -; VI-NEXT: v_add_u16_e32 v10, 3, v38 -; VI-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v37 -; VI-NEXT: v_add_u16_e32 v12, 3, v36 -; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v51 +; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v12, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v24 +; VI-NEXT: v_or_b32_sdwa v13, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v14, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_or_b32_sdwa v15, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v16, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_or_b32_sdwa v17, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v38 +; VI-NEXT: v_or_b32_sdwa v19, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v36 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v19 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 -; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v18 +; VI-NEXT: v_or_b32_e32 v2, v17, v2 +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_or_b32_e32 v4, v15, v4 +; VI-NEXT: v_or_b32_e32 v5, v14, v5 +; VI-NEXT: v_or_b32_e32 v6, v13, v6 +; VI-NEXT: v_or_b32_e32 v7, v12, v7 +; VI-NEXT: v_or_b32_e32 v8, v11, v8 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB13_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v40i8_to_v10f32: +; GFX9-LABEL: bitcast_v40i8_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-NEXT: v_mov_b32_e32 v34, v6 -; GFX9-NEXT: v_mov_b32_e32 v33, v4 -; GFX9-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, v10 +; GFX9-NEXT: v_mov_b32_e32 v32, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v37, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v10 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v37, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: v_or_b32_sdwa v8, v55, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_or_b32_sdwa v9, v42, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr10 -; GFX9-NEXT: ; implicit-def: $vgpr12 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr13 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v53 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v36 +; GFX9-NEXT: v_add_u16_e32 v19, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 -; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 -; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 -; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 -; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 -; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v39 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v37 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB13_4: ; %end +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v19, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v18, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v17, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v16, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v15, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v14, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v13, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s6 +; GFX9-NEXT: v_perm_b32 v8, v11, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32: +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v35.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v35.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v34.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v35.h, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v23.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v28.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v29.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v26.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v27.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v25.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v25.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v17.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v16.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v18.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v2.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v11.l ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32: +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v10 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 ; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v19 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v49 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v71 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v8, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v12, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v14, v13, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 @@ -6054,234 +20270,1480 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v35, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v10, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v12, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v14, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v16, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v27, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v69, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v20, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v53, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v16, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v31, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v54, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v53, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v34, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v36, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v17, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v33, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v55, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v65, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v48, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v49, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v50, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v51, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v52, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v48, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v49, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v50, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v39, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v20, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v22, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v24, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v26, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v28, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v30, 3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v36, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v17, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v18, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v19, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v20, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v14, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v13, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v12, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v21 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s10, s19, 24 +; SI-NEXT: s_or_b32 s4, s10, s4 +; SI-NEXT: s_and_b32 s10, s28, 0xff +; SI-NEXT: s_lshl_b32 s15, s29, 8 +; SI-NEXT: s_or_b32 s10, s10, s15 +; SI-NEXT: s_and_b32 s15, s6, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s41, s7, 24 +; SI-NEXT: s_or_b32 s43, s41, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s41, s27, 24 +; SI-NEXT: s_or_b32 s15, s41, s15 +; SI-NEXT: s_and_b32 s41, s16, 0xff +; SI-NEXT: s_lshl_b32 s42, s17, 8 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s41, s41, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s42, s25, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v0, v10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: s_or_b32 s15, s4, s15 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s42, s8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_or_b32_e32 v15, v3, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v19, v7, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v25, v13, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v23, s4, v15 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s42, s12, 8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_or_b32_e32 v21, v28, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v32, v29, v18 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v18, v17, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v26, s4, v21 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s42, s14, 8 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_or_b32 s10, s10, s43 +; SI-NEXT: v_or_b32_e32 v33, v31, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s10, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 +; SI-NEXT: v_alignbit_b32 v13, v25, v21, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v33, 16 +; SI-NEXT: v_or_b32_e32 v21, s4, v33 +; SI-NEXT: s_lshr_b32 s42, s5, 16 +; SI-NEXT: s_lshr_b32 s43, s43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: s_add_i32 s41, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s41 +; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v5, s10, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v25, v26, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16 +; SI-NEXT: s_lshr_b32 s42, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s41 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v12, v26 +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: v_mov_b32_e32 v16, v21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v40i8_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v31, v13 +; VI-NEXT: v_mov_b32_e32 v36, v12 +; VI-NEXT: v_mov_b32_e32 v29, v10 +; VI-NEXT: v_mov_b32_e32 v33, v9 +; VI-NEXT: v_mov_b32_e32 v27, v8 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v30, v4 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v38, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v28, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v27 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v36 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v17 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v28 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; VI-NEXT: v_or_b32_e32 v5, v5, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v17, v49, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v22 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v10 +; GFX9-NEXT: v_mov_b32_e32 v27, v8 +; GFX9-NEXT: v_mov_b32_e32 v28, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v2 +; GFX9-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v7, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v11, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v25, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v29, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v13, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v15, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: %a1 = add <40 x i8> %a, splat (i8 3) - %a2 = bitcast <40 x i8> %a1 to <10 x float> + %a2 = bitcast <40 x i8> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <40 x i8> %a to <10 x float> + %a3 = bitcast <40 x i8> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v12 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v31 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10f32_to_v5f64: +; VI-LABEL: bitcast_v20i16_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v5f64: +; GFX9-LABEL: bitcast_v20i16_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f32_to_v5f64: +; GFX11-LABEL: bitcast_v20i16_to_v5f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6289,25 +21751,31 @@ define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <5 x double> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x double> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <5 x double> + %a3 = bitcast <20 x i16> %a to <5 x double> br label %end end: @@ -6315,62 +21783,390 @@ end: ret <5 x double> %phi } -define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB53_2 ; -; VI-LABEL: bitcast_v5f64_to_v10f32: +; VI-LABEL: bitcast_v20i16_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v5f64_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v9 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v10, v24 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v26 +; SI-NEXT: v_mov_b32_e32 v16, v27 +; SI-NEXT: v_mov_b32_e32 v18, v28 +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5f64_to_v10f32: +; GFX9-LABEL: bitcast_v5f64_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5f64_to_v10f32: +; GFX11-LABEL: bitcast_v5f64_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6378,14 +22174,14 @@ define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6393,89 +22189,407 @@ define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { cmp.true: %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <5 x double> %a1 to <10 x float> + %a2 = bitcast <5 x double> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <5 x double> %a to <10 x float> + %a3 = bitcast <5 x double> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v20, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v21, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v22, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v23, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v24, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 +; SI-NEXT: v_alignbit_b32 v20, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_branch .LBB55_5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: v_mov_b32_e32 v9, v22 +; SI-NEXT: v_mov_b32_e32 v13, v21 +; SI-NEXT: v_mov_b32_e32 v17, v20 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10f32_to_v5i64: +; VI-LABEL: bitcast_v5f64_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v12 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_4 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB56_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v31 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: .LBB56_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v5i64: +; GFX9-LABEL: bitcast_v20i16_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f32_to_v5i64: +; GFX11-LABEL: bitcast_v20i16_to_v5i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6483,25 +22597,31 @@ define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <5 x i64> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x i64> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <5 x i64> + %a3 = bitcast <20 x i16> %a to <5 x i64> br label %end end: @@ -6509,38 +22629,358 @@ end: ret <5 x i64> %phi } -define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB57_2 ; -; VI-LABEL: bitcast_v5i64_to_v10f32: +; VI-LABEL: bitcast_v20i16_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v5i64_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB58_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc @@ -6552,18 +22992,18 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5i64_to_v10f32: +; GFX9-LABEL: bitcast_v5i64_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc @@ -6575,11 +23015,11 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5i64_to_v10f32: +; GFX11-LABEL: bitcast_v5i64_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6587,7 +23027,7 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6602,7 +23042,7 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6610,384 +23050,584 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { cmp.true: %a1 = add <5 x i64> %a, splat (i64 3) - %a2 = bitcast <5 x i64> %a1 to <10 x float> + %a2 = bitcast <5 x i64> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v5i64_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_3 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB59_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: s_branch .LBB59_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB59_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: s_branch .LBB59_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <5 x i64> %a to <10 x float> + %a3 = bitcast <5 x i64> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v7 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v19 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_bfe_u32 v14, v21, 8, 8 -; GCN-NEXT: v_bfe_u32 v11, v4, 8, 8 -; GCN-NEXT: v_bfe_u32 v8, v3, 8, 8 -; GCN-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GCN-NEXT: v_or_b32_e32 v29, v50, v6 -; GCN-NEXT: v_or_b32_e32 v27, v49, v7 -; GCN-NEXT: v_or_b32_e32 v20, v52, v9 -; GCN-NEXT: v_or_b32_e32 v17, v51, v10 -; GCN-NEXT: v_or_b32_e32 v13, v54, v12 -; GCN-NEXT: v_or_b32_e32 v12, v53, v15 -; GCN-NEXT: v_or_b32_e32 v10, v42, v16 -; GCN-NEXT: v_or_b32_e32 v9, v40, v18 -; GCN-NEXT: v_or_b32_e32 v7, v45, v19 -; GCN-NEXT: v_or_b32_e32 v6, v44, v22 -; GCN-NEXT: v_alignbit_b32 v34, v27, v29, 24 -; GCN-NEXT: v_alignbit_b32 v36, v27, v29, 16 -; GCN-NEXT: v_alignbit_b32 v38, v27, v29, 8 -; GCN-NEXT: v_alignbit_b32 v32, v17, v20, 24 -; GCN-NEXT: v_alignbit_b32 v33, v17, v20, 16 -; GCN-NEXT: v_alignbit_b32 v35, v17, v20, 8 -; GCN-NEXT: v_alignbit_b32 v26, v12, v13, 24 -; GCN-NEXT: v_alignbit_b32 v28, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; GCN-NEXT: v_alignbit_b32 v19, v9, v10, 24 -; GCN-NEXT: v_alignbit_b32 v24, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v25, v9, v10, 8 -; GCN-NEXT: v_alignbit_b32 v15, v6, v7, 24 -; GCN-NEXT: v_alignbit_b32 v16, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 8, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; GCN-NEXT: v_bfe_u32 v22, v1, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v49 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_bfe_u32 v14, v21, 8, 8 -; GCN-NEXT: v_bfe_u32 v11, v4, 8, 8 -; GCN-NEXT: v_bfe_u32 v8, v3, 8, 8 -; GCN-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GCN-NEXT: v_or_b32_e32 v7, v6, v13 -; GCN-NEXT: v_or_b32_e32 v6, v22, v17 -; GCN-NEXT: v_or_b32_e32 v10, v9, v20 -; GCN-NEXT: v_or_b32_e32 v9, v23, v25 -; GCN-NEXT: v_or_b32_e32 v13, v12, v26 -; GCN-NEXT: v_or_b32_e32 v12, v24, v27 -; GCN-NEXT: v_or_b32_e32 v20, v15, v28 -; GCN-NEXT: v_or_b32_e32 v17, v16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v18, v30 -; GCN-NEXT: v_or_b32_e32 v27, v19, v31 -; GCN-NEXT: v_alignbit_b32 v34, v27, v29, 24 -; GCN-NEXT: v_alignbit_b32 v36, v27, v29, 16 -; GCN-NEXT: v_alignbit_b32 v38, v27, v29, 8 -; GCN-NEXT: v_alignbit_b32 v32, v17, v20, 24 -; GCN-NEXT: v_alignbit_b32 v33, v17, v20, 16 -; GCN-NEXT: v_alignbit_b32 v35, v17, v20, 8 -; GCN-NEXT: v_alignbit_b32 v26, v12, v13, 24 -; GCN-NEXT: v_alignbit_b32 v28, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; GCN-NEXT: v_alignbit_b32 v19, v9, v10, 24 -; GCN-NEXT: v_alignbit_b32 v24, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v25, v9, v10, 8 -; GCN-NEXT: v_alignbit_b32 v15, v6, v7, 24 -; GCN-NEXT: v_alignbit_b32 v16, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 8, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; GCN-NEXT: v_bfe_u32 v22, v1, 8, 8 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 36, v0 -; GCN-NEXT: v_or_b32_e32 v29, v29, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v35, v50, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v17, v17, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v13, v13, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v10, v10, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v7, v7, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v6, v6, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v34, v36 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v21, v49, v21 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v29, v32, v33 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v4, v11, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v26, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v3, v8, v3 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v19, v24 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v15, v16 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v22, v1 -; GCN-NEXT: v_or_b32_e32 v15, v18, v23 -; GCN-NEXT: v_or_b32_e32 v16, v25, v21 -; GCN-NEXT: v_or_b32_e32 v18, v27, v29 -; GCN-NEXT: v_or_b32_e32 v4, v17, v4 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v8, v8, v10 -; GCN-NEXT: v_or_b32_e32 v2, v9, v2 -; GCN-NEXT: v_or_b32_e32 v5, v5, v7 -; GCN-NEXT: v_or_b32_e32 v1, v6, v1 -; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f16_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v19, v49, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v53, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_or_b32_e32 v11, v52, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; SI-NEXT: v_or_b32_e32 v9, v40, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_or_b32_e32 v10, v55, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v43, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v8, v42, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v6, v46, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v45, v5 +; SI-NEXT: v_alignbit_b32 v26, v19, v24, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v24, 8 +; SI-NEXT: v_alignbit_b32 v25, v11, v12, 24 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 8 +; SI-NEXT: v_alignbit_b32 v18, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v13, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v14, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, v5, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_bfe_u32 v48, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v24, v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_or_b32_e32 v19, v15, v13 +; SI-NEXT: v_alignbit_b32 v26, v19, v24, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v24, 8 +; SI-NEXT: v_alignbit_b32 v25, v11, v12, 24 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 8 +; SI-NEXT: v_alignbit_b32 v18, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v13, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v14, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, v5, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_bfe_u32 v48, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 +; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v26, v26, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v39 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v48 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_or_b32_e32 v19, v19, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v25 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v40i8: ; VI: ; %bb.0: @@ -7025,7 +23665,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB60_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7047,9 +23687,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB18_2: ; %Flow +; VI-NEXT: .LBB60_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_4 +; VI-NEXT: s_cbranch_execz .LBB60_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7112,7 +23752,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v35, v19, 8, 8 ; VI-NEXT: v_bfe_u32 v38, v21, 8, 8 ; VI-NEXT: v_bfe_u32 v48, v23, 8, 8 -; VI-NEXT: .LBB18_4: ; %end +; VI-NEXT: .LBB60_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -7222,7 +23862,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB60_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7254,9 +23894,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: .LBB60_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: s_cbranch_execz .LBB60_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -7299,7 +23939,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: .LBB60_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -7391,7 +24031,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7413,9 +24053,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB60_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -7447,7 +24087,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_4: ; %end +; GFX11-TRUE16-NEXT: .LBB60_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -7585,7 +24225,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7617,9 +24257,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB18_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB60_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB60_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -7661,7 +24301,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB18_4: ; %end +; GFX11-FAKE16-NEXT: .LBB60_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -7776,310 +24416,1649 @@ end: ret <40 x i8> %phi } +define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v16, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s26 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v43, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v28, v15, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v24, v12, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v14, v33, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v20, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v7, v53, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v11, v50, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v44, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v41, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v46, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_or_b32_e32 v3, v45, v3 +; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 +; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 +; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 +; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 +; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v50 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_or_b32_e32 v28, v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v24, v12, v15 +; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 +; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 +; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 +; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 +; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v37 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v42 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v29 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v55 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v20f16_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s41, s25, 24 +; VI-NEXT: s_lshr_b32 s59, s25, 16 +; VI-NEXT: s_lshr_b32 s26, s25, 8 +; VI-NEXT: s_lshr_b32 s60, s24, 16 +; VI-NEXT: s_lshr_b32 s27, s24, 8 +; VI-NEXT: s_lshr_b32 s43, s23, 24 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s28, s23, 8 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s29, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s40, s21, 8 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s42, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s45, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s75, s17, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 8 +; VI-NEXT: s_lshr_b32 s76, s16, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 8 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB61_4 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v8, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_add_f16_e32 v17, s17, v1 +; VI-NEXT: v_add_f16_e32 v12, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v39, v17, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: v_add_f16_e32 v22, s16, v1 +; VI-NEXT: v_add_f16_e32 v9, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v38, v22, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; VI-NEXT: v_add_f16_e32 v18, s19, v1 +; VI-NEXT: v_add_f16_e32 v13, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v36, v18, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_add_f16_e32 v23, s18, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v35, v23, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; VI-NEXT: v_add_f16_e32 v19, s21, v1 +; VI-NEXT: v_add_f16_e32 v14, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v33, v19, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_add_f16_e32 v24, s20, v1 +; VI-NEXT: v_add_f16_e32 v11, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v32, v24, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; VI-NEXT: v_add_f16_e32 v20, s23, v1 +; VI-NEXT: v_add_f16_e32 v15, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v30, v20, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_add_f16_e32 v25, s22, v1 +; VI-NEXT: v_add_f16_e32 v7, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v29, v25, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; VI-NEXT: v_add_f16_e32 v21, s25, v1 +; VI-NEXT: v_add_f16_e32 v16, s4, v1 +; VI-NEXT: v_or_b32_e32 v49, v21, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; VI-NEXT: v_add_f16_e32 v26, s24, v1 +; VI-NEXT: v_or_b32_e32 v48, v26, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[48:49] +; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[29:30] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[35:36] +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v48 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v38 +; VI-NEXT: v_bfe_u32 v6, v7, 8, 8 +; VI-NEXT: v_bfe_u32 v29, v11, 8, 8 +; VI-NEXT: v_bfe_u32 v32, v10, 8, 8 +; VI-NEXT: v_bfe_u32 v35, v9, 8, 8 +; VI-NEXT: v_bfe_u32 v38, v8, 8, 8 +; VI-NEXT: s_branch .LBB61_5 +; VI-NEXT: .LBB61_3: +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: s_branch .LBB61_2 +; VI-NEXT: .LBB61_4: +; VI-NEXT: v_mov_b32_e32 v12, s76 +; VI-NEXT: v_mov_b32_e32 v8, s75 +; VI-NEXT: v_mov_b32_e32 v13, s74 +; VI-NEXT: v_mov_b32_e32 v9, s73 +; VI-NEXT: v_mov_b32_e32 v14, s72 +; VI-NEXT: v_mov_b32_e32 v10, s63 +; VI-NEXT: v_mov_b32_e32 v15, s62 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v16, s60 +; VI-NEXT: v_mov_b32_e32 v7, s59 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v23, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v24, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v20, s23 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v21, s25 +; VI-NEXT: v_mov_b32_e32 v38, s58 +; VI-NEXT: v_mov_b32_e32 v35, s57 +; VI-NEXT: v_mov_b32_e32 v32, s46 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v6, s41 +; VI-NEXT: v_mov_b32_e32 v48, s56 +; VI-NEXT: v_mov_b32_e32 v39, s47 +; VI-NEXT: v_mov_b32_e32 v37, s45 +; VI-NEXT: v_mov_b32_e32 v36, s44 +; VI-NEXT: v_mov_b32_e32 v34, s42 +; VI-NEXT: v_mov_b32_e32 v33, s40 +; VI-NEXT: v_mov_b32_e32 v31, s29 +; VI-NEXT: v_mov_b32_e32 v30, s28 +; VI-NEXT: v_mov_b32_e32 v28, s27 +; VI-NEXT: v_mov_b32_e32 v27, s26 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: .LBB61_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v22, v22, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; VI-NEXT: v_or_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v5, v23, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v35 +; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v4, v24, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s28, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 8 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s21, 8 +; GFX9-NEXT: s_lshr_b32 s56, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 8 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s17, 8 +; GFX9-NEXT: s_lshr_b32 s74, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB61_4 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s21, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s20, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s23, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s22, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s25, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s24, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: s_branch .LBB61_5 +; GFX9-NEXT: .LBB61_3: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB61_2 +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-NEXT: v_mov_b32_e32 v39, s76 +; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s73 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s63 +; GFX9-NEXT: v_mov_b32_e32 v34, s61 +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s60 +; GFX9-NEXT: v_mov_b32_e32 v32, s59 +; GFX9-NEXT: v_mov_b32_e32 v30, s58 +; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v28, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s46 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s44 +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: .LBB61_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v20f16_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-TRUE16-NEXT: .LBB61_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB61_5 +; GFX11-TRUE16-NEXT: .LBB61_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB61_2 +; GFX11-TRUE16-NEXT: .LBB61_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s4 +; GFX11-TRUE16-NEXT: .LBB61_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v32, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v30, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-FAKE16-NEXT: .LBB61_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB61_5 +; GFX11-FAKE16-NEXT: .LBB61_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB61_2 +; GFX11-FAKE16-NEXT: .LBB61_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s20 :: v_dual_mov_b32 v2, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s63 :: v_dual_mov_b32 v39, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s62 :: v_dual_mov_b32 v37, s60 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s56 :: v_dual_mov_b32 v33, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s47 :: v_dual_mov_b32 v31, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v29, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s44 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v25, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s28 :: v_dual_mov_b32 v23, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v7, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: .LBB61_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v34, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v10, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v24, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v3, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v4, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v57 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v58 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v59 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: v_or_b32_e32 v10, v10, v54 -; GCN-NEXT: v_or_b32_e32 v11, v11, v55 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: v_or_b32_e32 v13, v13, v41 -; GCN-NEXT: v_or_b32_e32 v14, v14, v42 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_or_b32_e32 v16, v16, v44 -; GCN-NEXT: v_or_b32_e32 v17, v17, v45 -; GCN-NEXT: v_or_b32_e32 v18, v18, v46 -; GCN-NEXT: v_or_b32_e32 v19, v19, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB19_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v59 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v47, v1 -; GCN-NEXT: v_or_b32_e32 v3, v46, v3 -; GCN-NEXT: v_or_b32_e32 v5, v45, v5 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v9, v43, v9 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: v_or_b32_e32 v13, v41, v13 -; GCN-NEXT: v_or_b32_e32 v15, v40, v15 -; GCN-NEXT: v_or_b32_e32 v17, v55, v17 -; GCN-NEXT: v_or_b32_e32 v19, v54, v19 -; GCN-NEXT: v_or_b32_e32 v18, v53, v18 -; GCN-NEXT: v_or_b32_e32 v16, v52, v16 -; GCN-NEXT: v_or_b32_e32 v14, v51, v14 -; GCN-NEXT: v_or_b32_e32 v12, v50, v12 -; GCN-NEXT: v_or_b32_e32 v10, v49, v10 -; GCN-NEXT: v_or_b32_e32 v8, v48, v8 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_or_b32_e32 v2, v37, v2 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 -; GCN-NEXT: .LBB19_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v31 -; GCN-NEXT: v_mov_b32_e32 v2, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v35 -; GCN-NEXT: v_mov_b32_e32 v6, v27 -; GCN-NEXT: v_mov_b32_e32 v8, v33 -; GCN-NEXT: v_mov_b32_e32 v10, v21 -; GCN-NEXT: v_mov_b32_e32 v12, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v29 -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: v_mov_b32_e32 v18, v34 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i8_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v31, v2 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v33 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v6, v6, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: @@ -8140,7 +26119,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -8213,9 +26192,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: .LBB62_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: s_cbranch_execz .LBB62_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8291,7 +26270,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v7, v12, v7 ; VI-NEXT: v_or_b32_e32 v8, v11, v8 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: .LBB62_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -8363,7 +26342,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB62_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -8437,9 +26416,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: .LBB62_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cbranch_execz .LBB62_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 @@ -8514,7 +26493,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v7, v12, v7, s6 ; GFX9-NEXT: v_perm_b32 v8, v11, v8, s6 ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: .LBB62_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -8590,15 +26569,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_4 -; GFX11-TRUE16-NEXT: .LBB19_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_4 +; GFX11-TRUE16-NEXT: .LBB62_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB19_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB62_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v21.h @@ -8681,8 +26660,8 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-TRUE16-NEXT: .LBB19_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 @@ -8817,15 +26796,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB62_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_4 -; GFX11-FAKE16-NEXT: .LBB19_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB62_4 +; GFX11-FAKE16-NEXT: .LBB62_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB19_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB62_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v35 @@ -8918,8 +26897,8 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-FAKE16-NEXT: .LBB19_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-FAKE16-NEXT: .LBB62_4: ; %cmp.true ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 @@ -9034,167 +27013,1284 @@ end: ret <20 x half> %phi } +define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_readfirstlane_b32 s62, v25 +; SI-NEXT: v_readfirstlane_b32 s63, v24 +; SI-NEXT: v_readfirstlane_b32 s60, v23 +; SI-NEXT: v_readfirstlane_b32 s61, v22 +; SI-NEXT: v_readfirstlane_b32 s58, v21 +; SI-NEXT: v_readfirstlane_b32 s59, v20 +; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_readfirstlane_b32 s57, v18 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_readfirstlane_b32 s47, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v11 +; SI-NEXT: v_readfirstlane_b32 s41, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_and_b32 s11, s40, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s6, s6, s11 +; SI-NEXT: s_and_b32 s11, s13, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s28, 0xff +; SI-NEXT: s_lshl_b32 s11, s29, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s26, 0xff +; SI-NEXT: s_lshl_b32 s13, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_and_b32 s13, s24, 0xff +; SI-NEXT: s_lshl_b32 s14, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s61, 0xff +; SI-NEXT: s_lshl_b32 s60, s60, 8 +; SI-NEXT: s_and_b32 s59, s59, 0xff +; SI-NEXT: s_lshl_b32 s58, s58, 8 +; SI-NEXT: s_and_b32 s57, s57, 0xff +; SI-NEXT: s_lshl_b32 s56, s56, 8 +; SI-NEXT: s_and_b32 s47, s47, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 8 +; SI-NEXT: s_and_b32 s45, s45, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: s_and_b32 s43, s43, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: s_and_b32 s41, s41, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s22, 0xff +; SI-NEXT: s_lshl_b32 s22, s23, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s5, s60, s5 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_or_b32 s42, s42, s43 +; SI-NEXT: s_or_b32 s15, s15, s41 +; SI-NEXT: s_or_b32 s14, s22, s14 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_addk_i32 s46, 0x300 +; SI-NEXT: s_addk_i32 s44, 0x300 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v40i8_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v31, v13 +; VI-NEXT: v_mov_b32_e32 v36, v12 +; VI-NEXT: v_mov_b32_e32 v29, v10 +; VI-NEXT: v_mov_b32_e32 v33, v9 +; VI-NEXT: v_mov_b32_e32 v27, v8 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v30, v4 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: s_cbranch_scc0 .LBB63_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v38, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v28, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB63_3 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v30 +; VI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v27 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v36 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v17 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v28 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; VI-NEXT: v_or_b32_e32 v5, v5, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v17, v49, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v22 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v26, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v20 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB63_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v10 +; GFX9-NEXT: v_mov_b32_e32 v27, v8 +; GFX9-NEXT: v_mov_b32_e32 v28, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v2 +; GFX9-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB63_3 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB63_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB63_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v7, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v11, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: .LBB63_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB63_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB63_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB63_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB63_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB63_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_or_b32_e32 v0, v35, v0 -; GCN-NEXT: v_or_b32_e32 v1, v33, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v29, v2 -; GCN-NEXT: v_or_b32_e32 v3, v27, v3 -; GCN-NEXT: v_or_b32_e32 v4, v25, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v21, v6 -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v16, v9 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v16, v17 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f16_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5f64: ; VI: ; %bb.0: @@ -9203,7 +28299,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB64_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 0x200 ; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9236,7 +28332,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB64_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9247,7 +28343,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB64_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -9260,7 +28356,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB64_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9272,7 +28368,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -9284,7 +28380,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9304,122 +28400,434 @@ end: ret <5 x double> %phi } +define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v20f16_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_4 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_3: +; VI-NEXT: s_branch .LBB65_2 +; VI-NEXT: .LBB65_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_4 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_3: +; GFX9-NEXT: s_branch .LBB65_2 +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_4 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_3: +; GFX11-NEXT: s_branch .LBB65_2 +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v24 -; GCN-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NEXT: v_mov_b32_e32 v2, v20 -; GCN-NEXT: v_mov_b32_e32 v3, v28 -; GCN-NEXT: v_mov_b32_e32 v4, v21 -; GCN-NEXT: v_mov_b32_e32 v5, v27 -; GCN-NEXT: v_mov_b32_e32 v6, v22 -; GCN-NEXT: v_mov_b32_e32 v7, v26 -; GCN-NEXT: v_mov_b32_e32 v8, v23 -; GCN-NEXT: v_mov_b32_e32 v9, v25 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f64_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v2, v27 +; SI-NEXT: v_mov_b32_e32 v3, v26 +; SI-NEXT: v_mov_b32_e32 v4, v25 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: v_mov_b32_e32 v6, v24 +; SI-NEXT: v_mov_b32_e32 v7, v21 +; SI-NEXT: v_mov_b32_e32 v8, v22 +; SI-NEXT: v_mov_b32_e32 v9, v20 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20f16: ; VI: ; %bb.0: @@ -9428,14 +28836,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9446,14 +28854,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9465,14 +28873,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB66_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9492,167 +28900,385 @@ end: ret <20 x half> %phi } +define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v5f64_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_or_b32_e32 v0, v35, v0 -; GCN-NEXT: v_or_b32_e32 v1, v33, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v29, v2 -; GCN-NEXT: v_or_b32_e32 v3, v27, v3 -; GCN-NEXT: v_or_b32_e32 v4, v25, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v21, v6 -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v16, v9 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v16, v17 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f16_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_4 +; SI-NEXT: .LBB68_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB68_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: .LBB68_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5i64: ; VI: ; %bb.0: @@ -9661,7 +29287,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 0x200 ; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9694,7 +29320,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB68_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9705,7 +29331,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -9718,7 +29344,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB68_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9730,7 +29356,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -9742,7 +29368,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB68_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9762,136 +29388,448 @@ end: ret <5 x i64> %phi } +define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v20f16_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB69_4 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_3: +; VI-NEXT: s_branch .LBB69_2 +; VI-NEXT: .LBB69_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB69_4 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_3: +; GFX9-NEXT: s_branch .LBB69_2 +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB69_4 +; GFX11-NEXT: .LBB69_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB69_3: +; GFX11-NEXT: s_branch .LBB69_2 +; GFX11-NEXT: .LBB69_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v9 -; GCN-NEXT: v_mov_b32_e32 v21, v8 -; GCN-NEXT: v_mov_b32_e32 v24, v7 -; GCN-NEXT: v_mov_b32_e32 v23, v6 -; GCN-NEXT: v_mov_b32_e32 v26, v5 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v28, v3 -; GCN-NEXT: v_mov_b32_e32 v27, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v22, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i64_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v21, vcc +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20f16: ; VI: ; %bb.0: @@ -9900,7 +29838,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB70_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc @@ -9912,7 +29850,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9923,7 +29861,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc @@ -9935,7 +29873,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9947,7 +29885,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9962,7 +29900,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB70_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9982,305 +29920,528 @@ end: ret <20 x half> %phi } +define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s24, 3 +; SI-NEXT: s_addc_u32 s21, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v5i64_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v45 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v44 -; GCN-NEXT: v_or_b32_e32 v0, v0, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v53, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v54, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v55, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v40, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v41, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v42, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i8_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB72_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: @@ -10342,7 +30503,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -10415,9 +30576,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB72_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB72_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 @@ -10491,7 +30652,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB72_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -10565,7 +30726,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -10638,9 +30799,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB72_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 @@ -10714,7 +30875,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB72_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -10785,15 +30946,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_4 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_4 +; GFX11-TRUE16-NEXT: .LBB72_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB24_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l @@ -10906,8 +31067,8 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 +; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 @@ -11071,15 +31232,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_4 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_4 +; GFX11-FAKE16-NEXT: .LBB72_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB24_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -11192,8 +31353,8 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 +; GFX11-FAKE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -11324,227 +31485,1379 @@ end: ret <5 x double> %phi } +define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v40i8_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v36, v14 +; VI-NEXT: v_mov_b32_e32 v37, v13 +; VI-NEXT: v_mov_b32_e32 v38, v12 +; VI-NEXT: v_mov_b32_e32 v29, v10 +; VI-NEXT: v_mov_b32_e32 v28, v9 +; VI-NEXT: v_mov_b32_e32 v27, v8 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v31, v5 +; VI-NEXT: v_mov_b32_e32 v30, v4 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; VI-NEXT: s_cbranch_scc0 .LBB73_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v32, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB73_3 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v33 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v35 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v37 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v28 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v18, v50, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_sdwa v14, v48, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_or_b32_sdwa v12, v39, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v20 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v5, v5, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v6, v6, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB73_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB73_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v32, v10 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v27, v2 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB73_3 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB73_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB73_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v5f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_3 +; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v37 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB73_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB73_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB73_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v5f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v6 :: v_dual_mov_b32 v24, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v2 :: v_dual_mov_b32 v26, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v30 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_3 +; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v34, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v37, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v36, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v24 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v33, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v17, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v19, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v21, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB73_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB73_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB73_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v17, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v18, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v30, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v17, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v18, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v30, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; GCN-NEXT: v_or_b32_e32 v49, v49, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_or_b32_e32 v48, v51, v48 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v22, v32, v35 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v33, v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v30, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v18, v26, v27 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v23, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v20, v13 -; GCN-NEXT: v_or_b32_e32 v13, v21, v22 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_or_b32_e32 v4, v4, v30 -; GCN-NEXT: v_or_b32_e32 v5, v5, v17 -; GCN-NEXT: v_or_b32_e32 v6, v6, v18 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f64_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB74_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v40i8: ; VI: ; %bb.0: @@ -11582,7 +32895,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -11614,9 +32927,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB74_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; VI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -11653,7 +32966,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB74_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -11763,7 +33076,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -11795,9 +33108,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB74_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB74_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX9-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -11834,7 +33147,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB74_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -11926,7 +33239,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -11948,9 +33261,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -11977,7 +33290,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %end +; GFX11-TRUE16-NEXT: .LBB74_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12115,7 +33428,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -12147,9 +33460,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB74_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -12186,7 +33499,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %end +; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12249,40 +33562,1234 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v11 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v33, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v25, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v20, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_alignbit_b32 v2, s25, v1, 24 +; SI-NEXT: v_alignbit_b32 v11, s25, v1, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: v_alignbit_b32 v4, s23, v1, 24 +; SI-NEXT: v_alignbit_b32 v13, s23, v1, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: v_alignbit_b32 v6, s21, v1, 24 +; SI-NEXT: v_alignbit_b32 v15, s21, v1, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_alignbit_b32 v8, s19, v1, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v1, 16 +; SI-NEXT: v_alignbit_b32 v17, s19, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: v_alignbit_b32 v18, s17, v1, 24 +; SI-NEXT: v_alignbit_b32 v19, s17, v1, 16 +; SI-NEXT: v_alignbit_b32 v20, s17, v1, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB75_4 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 +; SI-NEXT: v_readfirstlane_b32 s25, v2 +; SI-NEXT: v_readfirstlane_b32 s23, v4 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_alignbit_b32 v2, s25, v1, 24 +; SI-NEXT: v_alignbit_b32 v11, s25, v1, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v1, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v3, 24 +; SI-NEXT: v_alignbit_b32 v13, s23, v3, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v3, 8 +; SI-NEXT: v_alignbit_b32 v6, s21, v5, 24 +; SI-NEXT: v_alignbit_b32 v15, s21, v5, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, s19, v7, 24 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v7, 16 +; SI-NEXT: v_alignbit_b32 v17, s19, v7, 8 +; SI-NEXT: v_alignbit_b32 v18, s17, v9, 24 +; SI-NEXT: v_alignbit_b32 v19, s17, v9, 16 +; SI-NEXT: v_alignbit_b32 v20, s17, v9, 8 +; SI-NEXT: s_branch .LBB75_5 +; SI-NEXT: .LBB75_3: +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB75_2 +; SI-NEXT: .LBB75_4: +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_mov_b32_e32 v3, s22 +; SI-NEXT: v_mov_b32_e32 v5, s20 +; SI-NEXT: v_mov_b32_e32 v7, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: .LBB75_5: ; %end +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: v_or_b32_e32 v9, v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v14 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s60, s24, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 8 +; VI-NEXT: s_lshr_b32 s29, s23, 24 +; VI-NEXT: s_lshr_b32 s40, s23, 16 +; VI-NEXT: s_lshr_b32 s41, s23, 8 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 8 +; VI-NEXT: s_lshr_b32 s42, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s44, s21, 8 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 8 +; VI-NEXT: s_lshr_b32 s45, s19, 24 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 8 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 8 +; VI-NEXT: s_lshr_b32 s56, s17, 24 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: s_lshr_b32 s76, s16, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 8 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB75_4 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; VI-NEXT: v_readfirstlane_b32 s17, v10 +; VI-NEXT: v_readfirstlane_b32 s19, v8 +; VI-NEXT: v_readfirstlane_b32 s21, v6 +; VI-NEXT: v_readfirstlane_b32 s23, v4 +; VI-NEXT: v_readfirstlane_b32 s25, v2 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; VI-NEXT: s_lshr_b32 s29, s23, 24 +; VI-NEXT: s_lshr_b32 s40, s23, 16 +; VI-NEXT: s_lshr_b32 s41, s23, 8 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; VI-NEXT: s_lshr_b32 s42, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s44, s21, 8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v5 +; VI-NEXT: s_lshr_b32 s45, s19, 24 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v7 +; VI-NEXT: s_lshr_b32 s56, s17, 24 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; VI-NEXT: s_branch .LBB75_5 +; VI-NEXT: .LBB75_3: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB75_2 +; VI-NEXT: .LBB75_4: +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: v_mov_b32_e32 v25, s76 +; VI-NEXT: v_mov_b32_e32 v26, s75 +; VI-NEXT: v_mov_b32_e32 v23, s74 +; VI-NEXT: v_mov_b32_e32 v24, s73 +; VI-NEXT: v_mov_b32_e32 v21, s72 +; VI-NEXT: v_mov_b32_e32 v22, s63 +; VI-NEXT: v_mov_b32_e32 v19, s62 +; VI-NEXT: v_mov_b32_e32 v20, s61 +; VI-NEXT: v_mov_b32_e32 v17, s60 +; VI-NEXT: v_mov_b32_e32 v18, s59 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v13, s8 +; VI-NEXT: v_mov_b32_e32 v12, s10 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: .LBB75_5: ; %end +; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v15 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v25, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: s_lshl_b32 s5, s47, 8 +; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s6, s45, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_lshl_b32 s5, s44, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s43, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_lshl_b32 s5, s41, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s29, 8 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 24, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s60, s24, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 8 +; GFX9-NEXT: s_lshr_b32 s29, s23, 24 +; GFX9-NEXT: s_lshr_b32 s40, s23, 16 +; GFX9-NEXT: s_lshr_b32 s41, s23, 8 +; GFX9-NEXT: s_lshr_b32 s62, s22, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 8 +; GFX9-NEXT: s_lshr_b32 s42, s21, 24 +; GFX9-NEXT: s_lshr_b32 s43, s21, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 8 +; GFX9-NEXT: s_lshr_b32 s72, s20, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 8 +; GFX9-NEXT: s_lshr_b32 s45, s19, 24 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s19, 8 +; GFX9-NEXT: s_lshr_b32 s74, s18, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 8 +; GFX9-NEXT: s_lshr_b32 s56, s17, 24 +; GFX9-NEXT: s_lshr_b32 s57, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s76, s16, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_readfirstlane_b32 s17, v10 +; GFX9-NEXT: v_readfirstlane_b32 s19, v8 +; GFX9-NEXT: v_readfirstlane_b32 s21, v6 +; GFX9-NEXT: v_readfirstlane_b32 s23, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v2 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: s_lshr_b32 s29, s23, 24 +; GFX9-NEXT: s_lshr_b32 s40, s23, 16 +; GFX9-NEXT: s_lshr_b32 s41, s23, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; GFX9-NEXT: s_lshr_b32 s42, s21, 24 +; GFX9-NEXT: s_lshr_b32 s43, s21, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v5 +; GFX9-NEXT: s_lshr_b32 s45, s19, 24 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s19, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: s_lshr_b32 s56, s17, 24 +; GFX9-NEXT: s_lshr_b32 s57, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; GFX9-NEXT: s_branch .LBB75_5 +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s76 +; GFX9-NEXT: v_mov_b32_e32 v26, s75 +; GFX9-NEXT: v_mov_b32_e32 v23, s74 +; GFX9-NEXT: v_mov_b32_e32 v24, s73 +; GFX9-NEXT: v_mov_b32_e32 v21, s72 +; GFX9-NEXT: v_mov_b32_e32 v22, s63 +; GFX9-NEXT: v_mov_b32_e32 v19, s62 +; GFX9-NEXT: v_mov_b32_e32 v20, s61 +; GFX9-NEXT: v_mov_b32_e32 v17, s60 +; GFX9-NEXT: v_mov_b32_e32 v18, s59 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: .LBB75_5: ; %end +; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v15 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s47, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s46, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s45, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s44, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s42, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s41, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s28, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v5f64_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s45 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-TRUE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[1:2], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[4:5] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[12:13] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB75_5 +; GFX11-TRUE16-NEXT: .LBB75_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB75_2 +; GFX11-TRUE16-NEXT: .LBB75_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s47 +; GFX11-TRUE16-NEXT: .LBB75_5: ; %end +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s41 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s29 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v16 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_and_b32 v10, 0xff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s23 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s24 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[12:15], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[4:7], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s45, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s45 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-FAKE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[1:2], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[10:11] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[4:5] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[12:13] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-FAKE16-NEXT: s_branch .LBB75_5 +; GFX11-FAKE16-NEXT: .LBB75_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB75_2 +; GFX11-FAKE16-NEXT: .LBB75_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v7, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v1, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v15, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v5, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v19, s63 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v17, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s62 :: v_dual_mov_b32 v13, s59 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s60 :: v_dual_mov_b32 v9, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s58 :: v_dual_mov_b32 v11, s56 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s47 :: v_dual_mov_b32 v6, s46 +; GFX11-FAKE16-NEXT: .LBB75_5: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s41, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v13, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s28, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s19, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s22, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s14, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, s1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[12:15], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12302,304 +34809,310 @@ end: } define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v45 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v44 -; GCN-NEXT: v_or_b32_e32 v0, v0, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v53, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v54, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v55, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v40, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v41, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v42, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i8_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: @@ -12661,7 +35174,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12734,9 +35247,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB76_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB76_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 @@ -12810,7 +35323,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB76_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -12884,7 +35397,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12957,9 +35470,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB76_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB76_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 @@ -13033,7 +35546,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB76_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -13104,15 +35617,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_4 +; GFX11-TRUE16-NEXT: .LBB76_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l @@ -13225,8 +35738,8 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 @@ -13390,15 +35903,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_4 +; GFX11-FAKE16-NEXT: .LBB76_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -13511,8 +36024,8 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-FAKE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -13643,232 +36156,1384 @@ end: ret <5 x i64> %phi } +define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v40i8_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v36, v14 +; VI-NEXT: v_mov_b32_e32 v37, v13 +; VI-NEXT: v_mov_b32_e32 v38, v12 +; VI-NEXT: v_mov_b32_e32 v29, v10 +; VI-NEXT: v_mov_b32_e32 v28, v9 +; VI-NEXT: v_mov_b32_e32 v27, v8 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v31, v5 +; VI-NEXT: v_mov_b32_e32 v30, v4 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; VI-NEXT: s_cbranch_scc0 .LBB77_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; VI-NEXT: v_or_b32_sdwa v0, v32, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB77_3 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s19, 24 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s20, 0xff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_and_b32 s10, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_lshl_b32 s5, s27, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v20 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v33 +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v35 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v37 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v28 +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v18, v50, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_sdwa v14, v48, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_or_b32_sdwa v12, v39, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v20 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v5, v5, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v6, v6, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v12 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB77_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB77_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v32, v10 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v27, v2 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB77_3 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB77_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB77_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v5i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_3 +; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v37 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB77_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB77_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB77_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v5i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v6 :: v_dual_mov_b32 v24, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v2 :: v_dual_mov_b32 v26, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v30 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_3 +; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v34, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v37, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v36, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v24 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v33, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v17, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v19, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v21, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB77_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB77_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB77_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v35, v49, v35 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; GCN-NEXT: v_or_b32_e32 v6, v6, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v19, v32, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v38, v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v27, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v21, v28, v30 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v22, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v17, v13 -; GCN-NEXT: v_or_b32_e32 v13, v18, v19 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v26 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_or_b32_e32 v5, v5, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v21 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i64_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB78_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB78_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v40i8: ; VI: ; %bb.0: @@ -13906,7 +37571,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -13938,9 +37603,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB78_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB78_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -13982,7 +37647,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB78_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -14092,7 +37757,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB78_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -14124,9 +37789,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB78_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB78_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -14168,7 +37833,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB78_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -14260,7 +37925,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -14282,9 +37947,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB27_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -14319,7 +37984,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB27_4: ; %end +; GFX11-TRUE16-NEXT: .LBB78_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14457,7 +38122,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -14489,9 +38154,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB27_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB78_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -14536,7 +38201,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB27_4: ; %end +; GFX11-FAKE16-NEXT: .LBB78_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14651,24 +38316,1207 @@ end: ret <40 x i8> %phi } +define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v5i64_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s75, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s40, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB79_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s12, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s73, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s72, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s59, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s58, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s46, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s45, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s41, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s40, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s4, s6, s4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s28, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB79_2 +; +; GFX11-TRUE16-LABEL: bitcast_v5i64_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: .LBB79_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s25 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB79_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB79_2 +; +; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s63, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: .LBB79_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s58, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s45, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s22, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB79_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB79_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f64_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: .LBB80_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v5i64: ; VI: ; %bb.0: @@ -14677,14 +39525,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14695,14 +39543,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14714,14 +39562,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14741,29 +39589,184 @@ end: ret <5 x i64> %phi } +define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i64_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v5f64: ; VI: ; %bb.0: @@ -14772,7 +39775,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -14784,7 +39787,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14795,7 +39798,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -14807,7 +39810,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14819,7 +39822,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -14834,7 +39837,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB82_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14853,3 +39856,149 @@ end: %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <5 x double> %phi } + +define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v5i64_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_3 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: .LBB83_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: s_branch .LBB83_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: .LBB83_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: s_branch .LBB83_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: .LBB83_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: s_branch .LBB83_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 7f8b733038f1e..da529d9dd3048 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1,25 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @bitcast_i32_to_f32(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_f32: ; VI: ; %bb.0: @@ -77,20 +76,100 @@ end: ret float %phi } +define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_i32_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_i32_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_i32_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to float + br label %end + +cmp.false: + %a3 = bitcast i32 %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define i32 @bitcast_f32_to_i32(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_i32: ; VI: ; %bb.0: @@ -148,30 +227,113 @@ end: ret i32 %phi } +define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast float %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2i16: ; VI: ; %bb.0: @@ -229,37 +391,122 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_i32_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_i32_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_i32_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_i32: ; VI: ; %bb.0: @@ -320,37 +567,130 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v2i16_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v2i16_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2f16: ; VI: ; %bb.0: @@ -408,41 +748,129 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_i32_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_i32_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_i32_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_i32: ; VI: ; %bb.0: @@ -504,35 +932,134 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v2f16_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2bf16: ; VI: ; %bb.0: @@ -590,39 +1117,127 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_i32_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_i32_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_i32_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB13_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_i32: ; VI: ; %bb.0: @@ -631,7 +1246,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -651,7 +1266,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -662,7 +1277,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -681,7 +1296,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -693,7 +1308,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -714,7 +1329,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -726,7 +1341,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -746,7 +1361,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -766,47 +1381,198 @@ end: ret i32 %phi } -define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_i32_to_v1i32: +define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v2bf16_to_i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_4 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_3: +; VI-NEXT: s_branch .LBB15_2 +; VI-NEXT: .LBB15_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_i32_to_v1i32: +; GFX9-LABEL: bitcast_v2bf16_to_i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) { +; SI-LABEL: bitcast_i32_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: bitcast_i32_to_v1i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -837,20 +1603,100 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_i32_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_i32_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_i32_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB17_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: .LBB17_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_i32: ; VI: ; %bb.0: @@ -908,36 +1754,117 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v1i32_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v1i32_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-LABEL: bitcast_v1i32_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB19_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: .LBB19_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v4i8: ; VI: ; %bb.0: @@ -948,20 +1875,20 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB10_3 +; VI-NEXT: s_cbranch_execnz .LBB20_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB10_4 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB20_4 +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB10_3: ; %cmp.false +; VI-NEXT: .LBB20_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 -; VI-NEXT: .LBB10_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: .LBB20_4: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -978,20 +1905,20 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB10_3 +; GFX9-NEXT: s_cbranch_execnz .LBB20_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB10_4 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB20_4 +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB10_3: ; %cmp.false +; GFX9-NEXT: .LBB20_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: .LBB10_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: .LBB20_4: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -1036,20 +1963,20 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_4 -; GFX11-FAKE16-NEXT: .LBB10_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB10_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB20_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-FAKE16-NEXT: .LBB10_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -1074,52 +2001,209 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_i32_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_i32_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-TRUE16-LABEL: bitcast_i32_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-TRUE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: .LBB21_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB21_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB21_2 +; +; GFX11-FAKE16-LABEL: bitcast_i32_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-FAKE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: .LBB21_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB21_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr1 +; GFX11-FAKE16-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_i32: ; VI: ; %bb.0: @@ -1131,14 +2215,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: s_cbranch_execnz .LBB22_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_4 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB22_4 +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB11_3: ; %cmp.false +; VI-NEXT: .LBB22_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1147,8 +2231,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: .LBB11_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: .LBB22_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1170,14 +2254,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: s_cbranch_execnz .LBB22_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB22_4 +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB11_3: ; %cmp.false +; GFX9-NEXT: .LBB22_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1186,8 +2270,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: .LBB11_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: .LBB22_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1209,14 +2293,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -1232,8 +2316,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-TRUE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1263,14 +2347,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -1286,8 +2370,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-FAKE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1323,30 +2407,200 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v4i8_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v4i8_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v4i8_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_4 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB12_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: .LBB12_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_4 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2i16: ; VI: ; %bb.0: @@ -1404,37 +2658,124 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_f32: ; VI: ; %bb.0: @@ -1495,37 +2836,130 @@ end: ret float %phi } +define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v2i16_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v2i16_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_4 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB14_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: .LBB14_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_4 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB28_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: .LBB28_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2f16: ; VI: ; %bb.0: @@ -1583,41 +3017,130 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_f32_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_f32: ; VI: ; %bb.0: @@ -1679,35 +3202,134 @@ end: ret float %phi } +define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v2f16_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_4 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_3: +; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: .LBB31_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2bf16: ; VI: ; %bb.0: @@ -1765,39 +3387,129 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_f32: ; VI: ; %bb.0: @@ -1806,7 +3518,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -1826,7 +3538,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1837,7 +3549,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -1856,7 +3568,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1868,7 +3580,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -1889,7 +3601,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1901,7 +3613,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -1921,7 +3633,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1941,20 +3653,171 @@ end: ret float %phi } +define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v2bf16_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v1i32: ; VI: ; %bb.0: @@ -2012,20 +3875,102 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast float %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_f32: ; VI: ; %bb.0: @@ -2083,36 +4028,117 @@ end: ret float %phi } +define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v1i32_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v1i32_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v1i32_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v4i8: ; VI: ; %bb.0: @@ -2123,20 +4149,20 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: s_cbranch_execnz .LBB40_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_4 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB40_4 +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB20_3: ; %cmp.false +; VI-NEXT: .LBB40_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: .LBB20_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: .LBB40_4: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -2153,20 +4179,20 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-NEXT: s_cbranch_execnz .LBB40_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_4 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB40_4 +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB20_3: ; %cmp.false +; GFX9-NEXT: .LBB40_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: .LBB20_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: .LBB40_4: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -2211,20 +4237,20 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_4 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB20_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -2249,52 +4275,213 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s7, s16, 24 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s7, s16, 24 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s7, s16, 24 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f32_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f32_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast float %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_f32: ; VI: ; %bb.0: @@ -2306,14 +4493,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: s_cbranch_execnz .LBB42_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB21_4 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB42_4 +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB21_3: ; %cmp.false +; VI-NEXT: .LBB42_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2322,8 +4509,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: .LBB21_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: .LBB42_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2345,14 +4532,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: s_cbranch_execnz .LBB42_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB21_4 -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB42_4 +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB21_3: ; %cmp.false +; GFX9-NEXT: .LBB42_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2361,8 +4548,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: .LBB21_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: .LBB42_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2384,14 +4571,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_4 +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB21_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -2407,8 +4594,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-TRUE16-NEXT: .LBB21_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2438,14 +4625,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_4 +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB21_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -2461,8 +4648,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: .LBB21_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: .LBB42_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2498,38 +4685,208 @@ end: ret float %phi } +define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v4i8_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v4i8_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-LABEL: bitcast_v4i8_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB43_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2f16: ; VI: ; %bb.0: @@ -2590,29 +4947,120 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v2i16_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2i16: ; VI: ; %bb.0: @@ -2674,23 +5122,118 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v2f16_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v1, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: .LBB24_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2bf16: ; VI: ; %bb.0: @@ -2721,18 +5264,111 @@ define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2i16_to_v2bf16: +; GFX11-LABEL: bitcast_v2i16_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_add_i32 s6, s5, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v2i16_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v2bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2752,39 +5388,39 @@ end: } define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2i16: ; VI: ; %bb.0: @@ -2793,7 +5429,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -2813,7 +5449,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2824,7 +5460,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -2843,7 +5479,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,7 +5491,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2878,7 +5514,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2890,7 +5526,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -2910,7 +5546,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2930,37 +5566,187 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_4 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_3: +; VI-NEXT: s_branch .LBB51_2 +; VI-NEXT: .LBB51_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_4 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB26_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: .LBB26_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v1i32: ; VI: ; %bb.0: @@ -3021,30 +5807,123 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v2i16_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2i16: ; VI: ; %bb.0: @@ -3102,48 +5981,133 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v1i32_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-LABEL: bitcast_v1i32_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB55_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_4 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB28_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v4, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: .LBB28_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v3, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_4 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB56_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_bfe_u32 v3, v4, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: .LBB56_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_bfe_u32 v3, v3, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v4i8: ; VI: ; %bb.0: @@ -3156,21 +6120,21 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB28_3 +; VI-NEXT: s_cbranch_execnz .LBB56_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB28_4 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB56_4 +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB28_3: ; %cmp.false +; VI-NEXT: .LBB56_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: ; implicit-def: $vgpr4 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 -; VI-NEXT: .LBB28_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB56_2 +; VI-NEXT: .LBB56_4: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: v_add_u16_sdwa v2, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v4 @@ -3190,20 +6154,20 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB28_3 +; GFX9-NEXT: s_cbranch_execnz .LBB56_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB28_4 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB56_4 +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB28_3: ; %cmp.false +; GFX9-NEXT: .LBB56_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 -; GFX9-NEXT: .LBB28_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB56_2 +; GFX9-NEXT: .LBB56_4: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -3248,20 +6212,20 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_4 +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB28_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB56_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-FAKE16-NEXT: .LBB28_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 +; GFX11-FAKE16-NEXT: .LBB56_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -3286,55 +6250,226 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s7, s6, 8 +; SI-NEXT: s_and_b32 s8, s17, 0xffff +; SI-NEXT: s_bfe_u32 s9, s17, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s7, s6, 8 +; SI-NEXT: s_and_b32 s8, s17, 0xffff +; SI-NEXT: s_bfe_u32 s9, s17, 0x80008 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v2i16_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s7, s16, 24 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: s_add_i32 s6, s4, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s8, s4, 8 +; VI-NEXT: s_bfe_u32 s7, s6, 0x80008 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s7, s16, 24 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2i16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2i16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2i16: ; VI: ; %bb.0: @@ -3346,14 +6481,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: s_cbranch_execnz .LBB58_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB29_4 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB58_4 +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB29_3: ; %cmp.false +; VI-NEXT: .LBB58_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3362,8 +6497,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 -; VI-NEXT: .LBB29_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB58_2 +; VI-NEXT: .LBB58_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3385,14 +6520,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: s_cbranch_execnz .LBB58_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB29_4 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB58_4 +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB29_3: ; %cmp.false +; GFX9-NEXT: .LBB58_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3401,8 +6536,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 -; GFX9-NEXT: .LBB29_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB58_2 +; GFX9-NEXT: .LBB58_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3424,14 +6559,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_4 -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4 +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB29_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -3447,8 +6582,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-TRUE16-NEXT: .LBB29_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -3478,14 +6613,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_4 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_4 +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB29_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -3501,8 +6636,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-FAKE16-NEXT: .LBB29_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-FAKE16-NEXT: .LBB58_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -3538,42 +6673,216 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v4i8_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB59_3 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB59_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB59_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB59_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB59_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2bf16: ; VI: ; %bb.0: @@ -3587,36 +6896,136 @@ define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v1, 0x200, v0 ; VI-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s17 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v2f16_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_4 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v1, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_3: +; VI-NEXT: s_branch .LBB61_2 +; VI-NEXT: .LBB61_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v2f16_to_v2bf16: +; GFX9-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_4 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_3: +; GFX9-NEXT: s_branch .LBB61_2 +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2f16_to_v2bf16: +; GFX11-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_3: +; GFX11-NEXT: s_branch .LBB61_2 +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3636,43 +7045,43 @@ end: } define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2f16: ; VI: ; %bb.0: @@ -3681,7 +7090,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -3701,7 +7110,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3712,7 +7121,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: s_cbranch_execz .LBB62_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -3731,7 +7140,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB31_2: ; %end +; GFX9-NEXT: .LBB62_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3743,7 +7152,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -3764,7 +7173,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB31_2: ; %end +; GFX11-TRUE16-NEXT: .LBB62_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3776,7 +7185,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -3796,7 +7205,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB31_2: ; %end +; GFX11-FAKE16-NEXT: .LBB62_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3816,41 +7225,198 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v1i32: ; VI: ; %bb.0: @@ -3912,37 +7478,136 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v2f16_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_4 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_3: +; VI-NEXT: s_branch .LBB65_2 +; VI-NEXT: .LBB65_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_4 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_3: +; GFX9-NEXT: s_branch .LBB65_2 +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_4 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_3: +; GFX11-NEXT: s_branch .LBB65_2 +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2f16: ; VI: ; %bb.0: @@ -4000,46 +7665,135 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v1i32_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_3 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB67_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_4: +; VI-NEXT: s_branch .LBB67_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_3 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB67_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: s_branch .LBB67_2 +; +; GFX11-LABEL: bitcast_v1i32_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB67_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: .LBB67_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_4 -; GCN-NEXT: .LBB34_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB34_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: .LBB34_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_4 +; SI-NEXT: .LBB68_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB68_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: .LBB68_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v4i8: ; VI: ; %bb.0: @@ -4050,19 +7804,19 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB34_3 +; VI-NEXT: s_cbranch_execnz .LBB68_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB34_4 -; VI-NEXT: .LBB34_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB68_4 +; VI-NEXT: .LBB68_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB34_3: ; %cmp.false +; VI-NEXT: .LBB68_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 -; VI-NEXT: .LBB34_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB68_2 +; VI-NEXT: .LBB68_4: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 0x200 ; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 @@ -4082,20 +7836,20 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB34_3 +; GFX9-NEXT: s_cbranch_execnz .LBB68_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB34_4 -; GFX9-NEXT: .LBB34_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB68_4 +; GFX9-NEXT: .LBB68_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB34_3: ; %cmp.false +; GFX9-NEXT: .LBB68_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 -; GFX9-NEXT: .LBB34_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB68_2 +; GFX9-NEXT: .LBB68_4: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4141,20 +7895,20 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_4 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_4 +; GFX11-FAKE16-NEXT: .LBB68_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB34_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB68_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 +; GFX11-FAKE16-NEXT: .LBB68_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4179,51 +7933,221 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v2f16_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB69_4 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v2, s4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_3: +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB69_2 +; VI-NEXT: .LBB69_4: +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s7, s16, 24 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB69_4 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_3: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: s_branch .LBB69_2 +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2f16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_4 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2f16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_4 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_or_b32_e32 v1, v4, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2f16: ; VI: ; %bb.0: @@ -4235,14 +8159,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4251,8 +8175,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4274,14 +8198,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4290,8 +8214,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4313,14 +8237,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -4336,8 +8260,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4367,14 +8291,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -4390,8 +8314,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4427,39 +8351,207 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v4i8_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB36_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB36_4 -; GCN-NEXT: .LBB36_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB36_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: .LBB36_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB72_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB72_4 +; SI-NEXT: .LBB72_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB72_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: .LBB72_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v1i32: ; VI: ; %bb.0: @@ -4468,7 +8560,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -4488,7 +8580,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4499,7 +8591,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -4518,7 +8610,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4530,7 +8622,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -4551,7 +8643,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB36_2: ; %end +; GFX11-TRUE16-NEXT: .LBB72_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4563,7 +8655,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -4583,7 +8675,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB36_2: ; %end +; GFX11-FAKE16-NEXT: .LBB72_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4603,35 +8695,187 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2bf16: ; VI: ; %bb.0: @@ -4689,46 +8933,134 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_3 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB75_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: s_branch .LBB75_2 +; +; GFX11-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB75_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: .LBB75_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_4 -; GCN-NEXT: .LBB38_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB38_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: .LBB38_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_4 +; SI-NEXT: .LBB76_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB76_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: .LBB76_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v4i8: ; VI: ; %bb.0: @@ -4739,20 +9071,20 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB38_3 +; VI-NEXT: s_cbranch_execnz .LBB76_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB38_4 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB76_4 +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB38_3: ; %cmp.false +; VI-NEXT: .LBB76_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 -; VI-NEXT: .LBB38_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB76_2 +; VI-NEXT: .LBB76_4: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -4786,20 +9118,20 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB38_3 +; GFX9-NEXT: s_cbranch_execnz .LBB76_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB38_4 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB76_4 +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB38_3: ; %cmp.false +; GFX9-NEXT: .LBB76_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 -; GFX9-NEXT: .LBB38_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB76_2 +; GFX9-NEXT: .LBB76_4: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -4845,7 +9177,7 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4868,7 +9200,7 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-TRUE16-NEXT: .LBB38_4: ; %end +; GFX11-TRUE16-NEXT: .LBB76_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h @@ -4884,20 +9216,20 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_4 -; GFX11-FAKE16-NEXT: .LBB38_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_4 +; GFX11-FAKE16-NEXT: .LBB76_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB38_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_2 -; GFX11-FAKE16-NEXT: .LBB38_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-FAKE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4939,51 +9271,298 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s7, s16, 24 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s8, s16, 16 +; GFX9-NEXT: s_lshr_b32 s7, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB77_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB77_2 +; GFX11-TRUE16-NEXT: .LBB77_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB77_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: s_branch .LBB77_2 +; GFX11-FAKE16-NEXT: .LBB77_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v3, v0 -; GCN-NEXT: v_or_b32_e32 v0, v4, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2bf16: ; VI: ; %bb.0: @@ -4995,14 +9574,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: s_cbranch_execnz .LBB78_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB39_4 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB78_4 +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB39_3: ; %cmp.false +; VI-NEXT: .LBB78_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5011,8 +9590,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 -; VI-NEXT: .LBB39_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB78_2 +; VI-NEXT: .LBB78_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5034,14 +9613,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: s_cbranch_execnz .LBB78_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB39_4 -; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB78_4 +; GFX9-NEXT: .LBB78_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB39_3: ; %cmp.false +; GFX9-NEXT: .LBB78_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5050,8 +9629,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB39_2 -; GFX9-NEXT: .LBB39_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB78_2 +; GFX9-NEXT: .LBB78_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5073,14 +9652,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_4 -; GFX11-TRUE16-NEXT: .LBB39_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_4 +; GFX11-TRUE16-NEXT: .LBB78_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB39_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -5096,8 +9675,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-TRUE16-NEXT: .LBB39_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5127,14 +9706,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_4 -; GFX11-FAKE16-NEXT: .LBB39_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_4 +; GFX11-FAKE16-NEXT: .LBB78_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB39_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -5150,8 +9729,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-FAKE16-NEXT: .LBB39_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-FAKE16-NEXT: .LBB78_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5187,36 +9766,206 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshl_b32 s4, s19, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s6, s5, 0x3000000 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB79_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB79_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB79_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB79_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_4 -; GCN-NEXT: .LBB40_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB40_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: .LBB40_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_4 +; SI-NEXT: .LBB80_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB80_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: .LBB80_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v4i8: ; VI: ; %bb.0: @@ -5227,20 +9976,20 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB40_3 +; VI-NEXT: s_cbranch_execnz .LBB80_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB40_4 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB80_4 +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB40_3: ; %cmp.false +; VI-NEXT: .LBB80_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 -; VI-NEXT: .LBB40_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB80_2 +; VI-NEXT: .LBB80_4: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -5257,20 +10006,20 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB40_3 +; GFX9-NEXT: s_cbranch_execnz .LBB80_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB40_4 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB80_4 +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB40_3: ; %cmp.false +; GFX9-NEXT: .LBB80_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 -; GFX9-NEXT: .LBB40_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB80_2 +; GFX9-NEXT: .LBB80_4: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -5315,20 +10064,20 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB80_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_4 -; GFX11-FAKE16-NEXT: .LBB40_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB80_4 +; GFX11-FAKE16-NEXT: .LBB80_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB40_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB80_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 -; GFX11-FAKE16-NEXT: .LBB40_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB80_2 +; GFX11-FAKE16-NEXT: .LBB80_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -5353,52 +10102,209 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB81_2 +; +; VI-LABEL: bitcast_v1i32_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB81_3 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: .LBB81_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_4: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB81_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB81_3 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: .LBB81_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB81_2 +; +; GFX11-TRUE16-LABEL: bitcast_v1i32_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB81_3 +; GFX11-TRUE16-NEXT: .LBB81_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: .LBB81_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB81_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB81_2 +; +; GFX11-FAKE16-LABEL: bitcast_v1i32_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB81_3 +; GFX11-FAKE16-NEXT: .LBB81_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: .LBB81_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB81_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr1 +; GFX11-FAKE16-NEXT: s_branch .LBB81_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v1i32: ; VI: ; %bb.0: @@ -5410,14 +10316,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: s_cbranch_execnz .LBB82_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_4 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB82_4 +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB41_3: ; %cmp.false +; VI-NEXT: .LBB82_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5426,8 +10332,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 -; VI-NEXT: .LBB41_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB82_2 +; VI-NEXT: .LBB82_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5449,14 +10355,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: s_cbranch_execnz .LBB82_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_4 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB82_4 +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB41_3: ; %cmp.false +; GFX9-NEXT: .LBB82_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5465,8 +10371,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 -; GFX9-NEXT: .LBB41_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB82_2 +; GFX9-NEXT: .LBB82_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5488,14 +10394,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_4 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_4 +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB41_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -5511,8 +10417,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 -; GFX11-TRUE16-NEXT: .LBB41_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 +; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5542,14 +10448,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_4 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_4 +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB41_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -5565,8 +10471,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 -; GFX11-FAKE16-NEXT: .LBB41_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 +; GFX11-FAKE16-NEXT: .LBB82_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5601,3 +10507,173 @@ end: %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <1 x i32> %phi } + +define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v4i8_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB83_3 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB83_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB83_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB83_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB83_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB83_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB83_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index 2a7315c055a54..5842662481e5d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -1,34 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i32_to_v11f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i32_to_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v11f32: ; VI: ; %bb.0: @@ -119,30 +120,185 @@ end: ret <11 x float> %phi } +define inreg <11 x float> @bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i32_to_v11f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v11i32_to_v11f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v11i32_to_v11f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v11i32_to_v11f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to <11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f32_to_v11i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f32_to_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v11i32: ; VI: ; %bb.0: @@ -151,7 +307,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -164,7 +320,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -175,7 +331,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -188,7 +344,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -227,80 +383,242 @@ end: ret <11 x i32> %phi } +define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f32_to_v11i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f32_to_v11i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v11i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v11i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} + define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i32_to_v22i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i32_to_v22i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22i16: ; VI: ; %bb.0: @@ -309,7 +627,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 @@ -322,7 +640,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -333,7 +651,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 @@ -346,7 +664,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -358,7 +676,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 @@ -371,7 +689,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -391,133 +709,338 @@ end: ret <22 x i16> %phi } +define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i32_to_v22i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v11i32_to_v22i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v11i32_to_v22i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v11i32_to_v22i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <22 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to <22 x i16> + br label %end + +end: + %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i16> %phi +} + define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i16_to_v11i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v10 -; GCN-NEXT: v_mov_b32_e32 v27, v8 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v29 -; GCN-NEXT: v_or_b32_e32 v2, v2, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v29, v1 -; GCN-NEXT: v_or_b32_e32 v2, v30, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i16_to_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v11i32: ; VI: ; %bb.0: @@ -526,7 +1049,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 ; VI-NEXT: v_add_u16_e32 v11, 3, v10 @@ -562,7 +1085,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v11, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -573,7 +1096,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -586,7 +1109,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -598,7 +1121,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -611,7 +1134,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -631,147 +1154,433 @@ end: ret <11 x i32> %phi } +define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i16_to_v11i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v10, v0, v15 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v22i16_to_v11i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v22i16_to_v11i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22i16_to_v11i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i16> %a, splat (i16 3) + %a2 = bitcast <22 x i16> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <22 x i16> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} + define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i32_to_v22f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v31, v9 -; GCN-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NEXT: v_mov_b32_e32 v29, v7 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v5 -; GCN-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NEXT: v_mov_b32_e32 v25, v3 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v22, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i32_to_v22f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v25, v3 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22f16: ; VI: ; %bb.0: @@ -780,7 +1589,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 @@ -793,7 +1602,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -804,7 +1613,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 @@ -817,7 +1626,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -829,7 +1638,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 @@ -842,7 +1651,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -862,181 +1671,413 @@ end: ret <22 x half> %phi } +define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i32_to_v22f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v11i32_to_v22f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v11i32_to_v22f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v11i32_to_v22f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <22 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to <22 x half> + br label %end + +end: + %phi = phi <22 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x half> %phi +} + define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f16_to_v11i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v20 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_or_b32_e32 v1, v30, v1 -; GCN-NEXT: v_or_b32_e32 v2, v28, v2 -; GCN-NEXT: v_or_b32_e32 v3, v26, v3 -; GCN-NEXT: v_or_b32_e32 v4, v24, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v22, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v13, v8 -; GCN-NEXT: v_or_b32_e32 v9, v12, v9 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v18, v10 -; GCN-NEXT: v_or_b32_e32 v6, v20, v19 -; GCN-NEXT: v_or_b32_e32 v7, v15, v21 -; GCN-NEXT: v_or_b32_e32 v8, v13, v17 -; GCN-NEXT: v_or_b32_e32 v9, v12, v16 -; GCN-NEXT: v_or_b32_e32 v10, v11, v14 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f16_to_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11i32: ; VI: ; %bb.0: @@ -1045,7 +2086,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_sdwa v12, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1081,7 +2122,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v12 ; VI-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1092,7 +2133,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -1106,7 +2147,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1118,7 +2159,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -1131,7 +2172,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1151,80 +2192,399 @@ end: ret <11 x i32> %phi } +define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f16_to_v11i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v19, v6 +; SI-NEXT: v_or_b32_e32 v7, v17, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v22f16_to_v11i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v11, v1 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_add_f16_sdwa v11, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v22f16_to_v11i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f16_to_v11i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x half> %a, splat (half 0xH0200) + %a2 = bitcast <22 x half> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <22 x half> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} + define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f32_to_v22i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f32_to_v22i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22i16: ; VI: ; %bb.0: @@ -1233,7 +2593,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1246,7 +2606,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1257,7 +2617,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1270,7 +2630,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1309,133 +2669,351 @@ end: ret <22 x i16> %phi } +define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f32_to_v22i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f32_to_v22i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v22i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v22i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <22 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <22 x i16> + br label %end + +end: + %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i16> %phi +} + define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i16_to_v11f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v10 -; GCN-NEXT: v_mov_b32_e32 v27, v8 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v29 -; GCN-NEXT: v_or_b32_e32 v2, v2, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v29, v1 -; GCN-NEXT: v_or_b32_e32 v2, v30, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i16_to_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v11f32: ; VI: ; %bb.0: @@ -1444,7 +3022,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 ; VI-NEXT: v_add_u16_e32 v11, 3, v10 @@ -1480,7 +3058,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v11, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1491,7 +3069,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -1504,7 +3082,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1516,7 +3094,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -1529,7 +3107,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1549,147 +3127,433 @@ end: ret <11 x float> %phi } +define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i16_to_v11f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v10, v0, v15 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v22i16_to_v11f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v22i16_to_v11f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22i16_to_v11f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i16> %a, splat (i16 3) + %a2 = bitcast <22 x i16> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <22 x i16> %a to <11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f32_to_v22f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v31, v9 -; GCN-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NEXT: v_mov_b32_e32 v29, v7 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v5 -; GCN-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NEXT: v_mov_b32_e32 v25, v3 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v22, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f32_to_v22f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v25, v3 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22f16: ; VI: ; %bb.0: @@ -1698,7 +3562,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1711,7 +3575,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1722,7 +3586,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1735,7 +3599,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1774,181 +3638,431 @@ end: ret <22 x half> %phi } +define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f32_to_v22f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s25, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v11f32_to_v22f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v22f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v22f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <22 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <22 x half> + br label %end + +end: + %phi = phi <22 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x half> %phi +} + define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f16_to_v11f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v20 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_or_b32_e32 v1, v30, v1 -; GCN-NEXT: v_or_b32_e32 v2, v28, v2 -; GCN-NEXT: v_or_b32_e32 v3, v26, v3 -; GCN-NEXT: v_or_b32_e32 v4, v24, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v22, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v13, v8 -; GCN-NEXT: v_or_b32_e32 v9, v12, v9 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v18, v10 -; GCN-NEXT: v_or_b32_e32 v6, v20, v19 -; GCN-NEXT: v_or_b32_e32 v7, v15, v21 -; GCN-NEXT: v_or_b32_e32 v8, v13, v17 -; GCN-NEXT: v_or_b32_e32 v9, v12, v16 -; GCN-NEXT: v_or_b32_e32 v10, v11, v14 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f16_to_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11f32: ; VI: ; %bb.0: @@ -1957,7 +4071,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_sdwa v12, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1993,7 +4107,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v12 ; VI-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2004,7 +4118,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -2018,7 +4132,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2030,7 +4144,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -2043,7 +4157,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2063,158 +4177,477 @@ end: ret <11 x float> %phi } +define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f16_to_v11f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v19, v6 +; SI-NEXT: v_or_b32_e32 v7, v17, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v22f16_to_v11f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v11, v1 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_add_f16_sdwa v11, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v22f16_to_v11f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f16_to_v11f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x half> %a, splat (half 0xH0200) + %a2 = bitcast <22 x half> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <22 x half> %a to <11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i16_to_v22f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v51, v21 -; GCN-NEXT: v_mov_b32_e32 v50, v20 -; GCN-NEXT: v_mov_b32_e32 v49, v19 -; GCN-NEXT: v_mov_b32_e32 v48, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v17 -; GCN-NEXT: v_mov_b32_e32 v38, v16 -; GCN-NEXT: v_mov_b32_e32 v37, v15 -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v13 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v33, v11 -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v31, v9 -; GCN-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NEXT: v_mov_b32_e32 v29, v7 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v5 -; GCN-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NEXT: v_mov_b32_e32 v25, v3 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v52, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i16_to_v22f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_mov_b32_e32 v50, v20 +; SI-NEXT: v_mov_b32_e32 v49, v19 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v39, v17 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v37, v15 +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v25, v3 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v22f16: ; VI: ; %bb.0: @@ -2223,7 +4656,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 3 ; VI-NEXT: v_add_u16_sdwa v19, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2259,7 +4692,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v14 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2270,7 +4703,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -2283,7 +4716,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2295,7 +4728,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -2308,7 +4741,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2328,134 +4761,420 @@ end: ret <22 x half> %phi } +define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i16_to_v22f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v27 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v22i16_to_v22f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v22i16_to_v22f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22i16_to_v22f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i16> %a, splat (i16 3) + %a2 = bitcast <22 x i16> %a1 to <22 x half> + br label %end + +cmp.false: + %a3 = bitcast <22 x i16> %a to <22 x half> + br label %end + +end: + %phi = phi <22 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x half> %phi +} + define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f16_to_v22i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f16_to_v22i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v22i16: ; VI: ; %bb.0: @@ -2464,7 +5183,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 @@ -2500,7 +5219,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v14, v2 ; VI-NEXT: v_or_b32_e32 v1, v13, v1 ; VI-NEXT: v_or_b32_e32 v0, v11, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2511,7 +5230,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -2525,7 +5244,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2537,7 +5256,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -2550,7 +5269,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2569,3 +5288,327 @@ end: %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <22 x i16> %phi } + +define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v22f16_to_v22i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_add_f16_e32 v1, s25, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s26, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v10, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v12, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v11, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v11, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v22f16_to_v22i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f16_to_v22i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x half> %a, splat (half 0xH0200) + %a2 = bitcast <22 x half> %a1 to <22 x i16> + br label %end + +cmp.false: + %a3 = bitcast <22 x half> %a to <22 x i16> + br label %end + +end: + %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 264e2b2bf0122..fe3dd7ddc4174 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -1,35 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v12f32: ; VI: ; %bb.0: @@ -123,31 +124,193 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v12i32_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v12i32: ; VI: ; %bb.0: @@ -156,7 +319,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -170,7 +333,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -181,7 +344,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -195,7 +358,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -234,31 +397,200 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v6f64: ; VI: ; %bb.0: @@ -267,7 +599,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -281,7 +613,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -292,7 +624,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -306,7 +638,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -318,7 +650,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -332,7 +664,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -352,25 +684,187 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v12i32_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v12i32: ; VI: ; %bb.0: @@ -379,7 +873,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -387,7 +881,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -398,7 +892,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -406,7 +900,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -418,7 +912,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -426,7 +920,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -446,31 +940,176 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_4 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_3: +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_4 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_3: +; VI-NEXT: s_branch .LBB7_2 +; VI-NEXT: .LBB7_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v6i64: ; VI: ; %bb.0: @@ -479,7 +1118,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -493,7 +1132,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -504,7 +1143,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -518,7 +1157,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -530,7 +1169,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -544,7 +1183,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -564,31 +1203,193 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v12i32_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v12i32: ; VI: ; %bb.0: @@ -597,7 +1398,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -611,7 +1412,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -622,7 +1423,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -636,7 +1437,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -648,7 +1449,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -665,7 +1466,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -685,82 +1486,244 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v6i64_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_3 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB11_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: s_branch .LBB11_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24i16: ; VI: ; %bb.0: @@ -769,7 +1732,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -783,7 +1746,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -794,7 +1757,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -808,7 +1771,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -820,7 +1783,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -834,7 +1797,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -854,142 +1817,358 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v12i32_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v25 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v12i32: ; VI: ; %bb.0: @@ -998,7 +2177,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -1037,7 +2216,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1048,7 +2227,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -1062,7 +2241,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1074,7 +2253,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -1088,7 +2267,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1108,158 +2287,461 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v11, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v24i16_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v11 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v31, v7 -; GCN-NEXT: v_mov_b32_e32 v30, v6 -; GCN-NEXT: v_mov_b32_e32 v29, v5 -; GCN-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_mov_b32_e32 v28, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24f16: ; VI: ; %bb.0: @@ -1268,7 +2750,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -1282,7 +2764,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1293,7 +2775,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -1307,7 +2789,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1319,7 +2801,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -1333,7 +2815,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1353,195 +2835,441 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v12i32_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: v_or_b32_e32 v3, v30, v3 -; GCN-NEXT: v_or_b32_e32 v4, v28, v4 -; GCN-NEXT: v_or_b32_e32 v5, v26, v5 -; GCN-NEXT: v_or_b32_e32 v6, v25, v6 -; GCN-NEXT: v_or_b32_e32 v7, v24, v7 -; GCN-NEXT: v_or_b32_e32 v8, v16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v14, v9 -; GCN-NEXT: v_or_b32_e32 v10, v13, v10 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v20, v18 -; GCN-NEXT: v_or_b32_e32 v8, v16, v21 -; GCN-NEXT: v_or_b32_e32 v9, v14, v19 -; GCN-NEXT: v_or_b32_e32 v10, v13, v17 -; GCN-NEXT: v_or_b32_e32 v11, v12, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12i32: ; VI: ; %bb.0: @@ -1550,7 +3278,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1589,7 +3317,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1600,7 +3328,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -1615,7 +3343,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1627,7 +3355,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -1641,7 +3369,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1661,31 +3389,371 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v24f16_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v6f64: ; VI: ; %bb.0: @@ -1694,7 +3762,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1708,7 +3776,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1719,7 +3787,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1733,7 +3801,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1772,25 +3840,208 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_3: +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v12f32: ; VI: ; %bb.0: @@ -1799,7 +4050,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -1807,7 +4058,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,7 +4069,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -1826,7 +4077,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1838,7 +4089,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -1846,7 +4097,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1866,31 +4117,176 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_4 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v6i64: ; VI: ; %bb.0: @@ -1899,7 +4295,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1913,7 +4309,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1924,7 +4320,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1938,7 +4334,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,31 +4373,214 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v12f32: ; VI: ; %bb.0: @@ -2010,7 +4589,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -2024,7 +4603,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2035,7 +4614,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -2049,7 +4628,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2061,7 +4640,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2078,7 +4657,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2098,82 +4677,244 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v6i64_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24i16: ; VI: ; %bb.0: @@ -2182,7 +4923,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2196,7 +4937,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB28_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2207,7 +4948,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2221,7 +4962,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2260,142 +5001,369 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v25 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v12f32: ; VI: ; %bb.0: @@ -2404,7 +5372,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -2443,7 +5411,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2454,7 +5422,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -2468,7 +5436,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,7 +5448,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -2494,7 +5462,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2514,158 +5482,461 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v11, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v24i16_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v11 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v31, v7 -; GCN-NEXT: v_mov_b32_e32 v30, v6 -; GCN-NEXT: v_mov_b32_e32 v29, v5 -; GCN-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v33 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_mov_b32_e32 v28, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16: ; VI: ; %bb.0: @@ -2674,7 +5945,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2688,7 +5959,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2699,7 +5970,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2713,7 +5984,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,195 +6023,457 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s27, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v12f32_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: v_or_b32_e32 v3, v30, v3 -; GCN-NEXT: v_or_b32_e32 v4, v28, v4 -; GCN-NEXT: v_or_b32_e32 v5, v26, v5 -; GCN-NEXT: v_or_b32_e32 v6, v25, v6 -; GCN-NEXT: v_or_b32_e32 v7, v24, v7 -; GCN-NEXT: v_or_b32_e32 v8, v16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v14, v9 -; GCN-NEXT: v_or_b32_e32 v10, v13, v10 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v20, v18 -; GCN-NEXT: v_or_b32_e32 v8, v16, v21 -; GCN-NEXT: v_or_b32_e32 v9, v14, v19 -; GCN-NEXT: v_or_b32_e32 v10, v13, v17 -; GCN-NEXT: v_or_b32_e32 v11, v12, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12f32: ; VI: ; %bb.0: @@ -2949,7 +6482,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2988,7 +6521,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2999,7 +6532,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -3014,7 +6547,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3026,7 +6559,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -3040,7 +6573,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3060,25 +6593,365 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v24f16_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v6i64: ; VI: ; %bb.0: @@ -3087,7 +6960,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3095,7 +6968,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3106,7 +6979,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3114,7 +6987,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3126,7 +6999,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3134,7 +7007,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3154,31 +7027,190 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v6f64: ; VI: ; %bb.0: @@ -3187,7 +7219,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -3201,7 +7233,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3212,7 +7244,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3226,7 +7258,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3238,7 +7270,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3255,7 +7287,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3275,86 +7307,247 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v6i64_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_3 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: .LBB39_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: s_branch .LBB39_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v11 -; GCN-NEXT: v_mov_b32_e32 v33, v10 -; GCN-NEXT: v_mov_b32_e32 v32, v9 -; GCN-NEXT: v_mov_b32_e32 v31, v8 -; GCN-NEXT: v_mov_b32_e32 v30, v7 -; GCN-NEXT: v_mov_b32_e32 v29, v6 -; GCN-NEXT: v_mov_b32_e32 v28, v5 -; GCN-NEXT: v_mov_b32_e32 v27, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v3 -; GCN-NEXT: v_mov_b32_e32 v25, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; GCN-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v25 -; GCN-NEXT: v_mov_b32_e32 v6, v26 -; GCN-NEXT: v_mov_b32_e32 v8, v27 -; GCN-NEXT: v_mov_b32_e32 v10, v28 -; GCN-NEXT: v_mov_b32_e32 v12, v29 -; GCN-NEXT: v_mov_b32_e32 v14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, v31 -; GCN-NEXT: v_mov_b32_e32 v18, v32 -; GCN-NEXT: v_mov_b32_e32 v20, v33 -; GCN-NEXT: v_mov_b32_e32 v22, v34 -; GCN-NEXT: v_mov_b32_e32 v1, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v7 +; SI-NEXT: v_mov_b32_e32 v29, v6 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v8, v27 +; SI-NEXT: v_mov_b32_e32 v10, v28 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_mov_b32_e32 v16, v31 +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v20, v33 +; SI-NEXT: v_mov_b32_e32 v22, v34 +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24i16: ; VI: ; %bb.0: @@ -3363,7 +7556,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3371,7 +7564,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3382,7 +7575,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3390,7 +7583,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3402,7 +7595,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3410,7 +7603,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3430,144 +7623,360 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v24, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v25, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v27, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v28, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v29, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v24, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v25, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v5, v28 +; SI-NEXT: v_mov_b32_e32 v9, v27 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_mov_b32_e32 v17, v25 +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v31, v12 -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v27, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v39 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v33 -; GCN-NEXT: v_or_b32_e32 v4, v4, v34 -; GCN-NEXT: v_or_b32_e32 v5, v5, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: v_or_b32_e32 v7, v7, v37 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v33, v3 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_or_b32_e32 v5, v35, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v37, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v31, v12 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v6f64: ; VI: ; %bb.0: @@ -3576,7 +7985,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -3615,7 +8024,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3626,7 +8035,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -3640,7 +8049,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3652,7 +8061,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -3666,7 +8075,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3686,142 +8095,451 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v24 +; SI-NEXT: v_or_b32_e32 v11, v0, v21 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v24i16_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v29 -; GCN-NEXT: v_mov_b32_e32 v1, v35 -; GCN-NEXT: v_mov_b32_e32 v2, v24 -; GCN-NEXT: v_mov_b32_e32 v3, v34 -; GCN-NEXT: v_mov_b32_e32 v4, v25 -; GCN-NEXT: v_mov_b32_e32 v5, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v26 -; GCN-NEXT: v_mov_b32_e32 v7, v32 -; GCN-NEXT: v_mov_b32_e32 v8, v27 -; GCN-NEXT: v_mov_b32_e32 v9, v31 -; GCN-NEXT: v_mov_b32_e32 v10, v28 -; GCN-NEXT: v_mov_b32_e32 v11, v30 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: v_mov_b32_e32 v3, v32 +; SI-NEXT: v_mov_b32_e32 v4, v31 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v24 +; SI-NEXT: v_mov_b32_e32 v10, v26 +; SI-NEXT: v_mov_b32_e32 v11, v25 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16: ; VI: ; %bb.0: @@ -3830,7 +8548,7 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3838,7 +8556,7 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3849,7 +8567,7 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3857,28 +8575,266 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v6f64_to_v24f16: +; GFX11-LABEL: bitcast_v6f64_to_v24f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB44_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + +define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v6f64_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v24f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB22_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3898,194 +8854,194 @@ end: } define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; GCN-NEXT: v_or_b32_e32 v0, v49, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v30, v4 -; GCN-NEXT: v_or_b32_e32 v5, v28, v5 -; GCN-NEXT: v_or_b32_e32 v6, v27, v6 -; GCN-NEXT: v_or_b32_e32 v7, v25, v7 -; GCN-NEXT: v_or_b32_e32 v8, v24, v8 -; GCN-NEXT: v_or_b32_e32 v9, v18, v9 -; GCN-NEXT: v_or_b32_e32 v10, v17, v10 -; GCN-NEXT: v_or_b32_e32 v11, v16, v11 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v18, v20 -; GCN-NEXT: v_or_b32_e32 v10, v17, v21 -; GCN-NEXT: v_or_b32_e32 v11, v16, v19 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v31, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6f64: ; VI: ; %bb.0: @@ -4094,7 +9050,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4133,7 +9089,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4144,7 +9100,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -4159,7 +9115,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4171,7 +9127,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -4185,7 +9141,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4205,82 +9161,432 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_or_b32_e32 v8, v22, v8 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v24f16_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24i16: ; VI: ; %bb.0: @@ -4289,7 +9595,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -4303,7 +9609,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4314,7 +9620,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -4328,7 +9634,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4340,7 +9646,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4357,7 +9663,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4377,144 +9683,360 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v6i64_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB49_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: s_branch .LBB49_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v31, v12 -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v27, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v39 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v33 -; GCN-NEXT: v_or_b32_e32 v4, v4, v34 -; GCN-NEXT: v_or_b32_e32 v5, v5, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: v_or_b32_e32 v7, v7, v37 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v33, v3 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_or_b32_e32 v5, v35, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v37, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v31, v12 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v6i64: ; VI: ; %bb.0: @@ -4523,7 +10045,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -4562,7 +10084,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4573,7 +10095,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -4587,7 +10109,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,7 +10121,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -4613,7 +10135,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4633,158 +10155,467 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v24 +; SI-NEXT: v_or_b32_e32 v11, v0, v21 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v24i16_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v11 -; GCN-NEXT: v_mov_b32_e32 v25, v10 -; GCN-NEXT: v_mov_b32_e32 v28, v9 -; GCN-NEXT: v_mov_b32_e32 v27, v8 -; GCN-NEXT: v_mov_b32_e32 v30, v7 -; GCN-NEXT: v_mov_b32_e32 v29, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v5 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v34, v3 -; GCN-NEXT: v_mov_b32_e32 v33, v2 -; GCN-NEXT: v_mov_b32_e32 v35, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_4 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB26_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: .LBB26_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v35, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v34, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v31 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v32, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v30, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v26, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v31, v5 +; SI-NEXT: v_mov_b32_e32 v30, v4 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_mov_b32_e32 v34, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v25, vcc +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24f16: ; VI: ; %bb.0: @@ -4793,7 +10624,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -4807,7 +10638,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4818,7 +10649,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -4832,7 +10663,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4844,7 +10675,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4861,7 +10692,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4881,195 +10712,441 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s24, 3 +; SI-NEXT: s_addc_u32 s21, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s25, s27, 0 +; SI-NEXT: s_lshr_b32 s26, s24, 16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v6i64_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; GCN-NEXT: v_or_b32_e32 v0, v49, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v30, v4 -; GCN-NEXT: v_or_b32_e32 v5, v28, v5 -; GCN-NEXT: v_or_b32_e32 v6, v27, v6 -; GCN-NEXT: v_or_b32_e32 v7, v25, v7 -; GCN-NEXT: v_or_b32_e32 v8, v24, v8 -; GCN-NEXT: v_or_b32_e32 v9, v18, v9 -; GCN-NEXT: v_or_b32_e32 v10, v17, v10 -; GCN-NEXT: v_or_b32_e32 v11, v16, v11 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v18, v20 -; GCN-NEXT: v_or_b32_e32 v10, v17, v21 -; GCN-NEXT: v_or_b32_e32 v11, v16, v19 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v31, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6i64: ; VI: ; %bb.0: @@ -5078,7 +11155,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5117,7 +11194,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5128,7 +11205,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -5143,7 +11220,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5155,7 +11232,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -5169,7 +11246,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5189,170 +11266,520 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_or_b32_e32 v8, v22, v8 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v24f16_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v55, v23 -; GCN-NEXT: v_mov_b32_e32 v54, v22 -; GCN-NEXT: v_mov_b32_e32 v53, v21 -; GCN-NEXT: v_mov_b32_e32 v52, v20 -; GCN-NEXT: v_mov_b32_e32 v51, v19 -; GCN-NEXT: v_mov_b32_e32 v50, v18 -; GCN-NEXT: v_mov_b32_e32 v49, v17 -; GCN-NEXT: v_mov_b32_e32 v48, v16 -; GCN-NEXT: v_mov_b32_e32 v39, v15 -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_mov_b32_e32 v36, v12 -; GCN-NEXT: v_mov_b32_e32 v35, v11 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v31, v7 -; GCN-NEXT: v_mov_b32_e32 v30, v6 -; GCN-NEXT: v_mov_b32_e32 v29, v5 -; GCN-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v40, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v23 +; SI-NEXT: v_mov_b32_e32 v54, v22 +; SI-NEXT: v_mov_b32_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v52, v20 +; SI-NEXT: v_mov_b32_e32 v51, v19 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_mov_b32_e32 v49, v17 +; SI-NEXT: v_mov_b32_e32 v48, v16 +; SI-NEXT: v_mov_b32_e32 v39, v15 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v13 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_mov_b32_e32 v28, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v24f16: ; VI: ; %bb.0: @@ -5361,7 +11788,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 ; VI-NEXT: v_add_u16_sdwa v19, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5400,7 +11827,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v15 ; VI-NEXT: v_or_b32_e32 v1, v1, v14 ; VI-NEXT: v_or_b32_e32 v0, v0, v13 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5411,7 +11838,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -5425,7 +11852,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5437,7 +11864,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -5451,7 +11878,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5471,145 +11898,449 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v30, v9 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v27, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v31 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v24i16_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v24i16: ; VI: ; %bb.0: @@ -5618,7 +12349,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 @@ -5657,7 +12388,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v15, v2 ; VI-NEXT: v_or_b32_e32 v1, v14, v1 ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5668,7 +12399,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -5683,7 +12414,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5695,7 +12426,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -5709,7 +12440,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5728,3 +12459,347 @@ end: %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <24 x i16> %phi } + +define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v13, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v24f16_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s27, 16 +; VI-NEXT: v_add_f16_e32 v1, s26, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s25, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s27, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v11, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v13, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v12, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v12, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 4b2b9560e5927..25dd5c4e9499f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -1,37 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v14f32: ; VI: ; %bb.0: @@ -131,33 +132,213 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v14i32_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v14i32: ; VI: ; %bb.0: @@ -166,7 +347,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -182,7 +363,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -193,7 +374,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -209,7 +390,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -221,7 +402,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -230,7 +411,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -250,33 +431,227 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v7i64: ; VI: ; %bb.0: @@ -285,7 +660,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -301,7 +676,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -312,7 +687,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -328,7 +703,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -340,7 +715,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -356,7 +731,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -376,33 +751,213 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v14i32_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v14i32: ; VI: ; %bb.0: @@ -411,7 +966,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -427,7 +982,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -438,7 +993,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -454,7 +1009,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -466,7 +1021,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -486,7 +1041,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -506,33 +1061,213 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v7i64_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v7f64: ; VI: ; %bb.0: @@ -541,7 +1276,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -557,7 +1292,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -568,7 +1303,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -584,7 +1319,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -596,7 +1331,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -612,7 +1347,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -632,26 +1367,206 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v14i32_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v14i32: ; VI: ; %bb.0: @@ -660,7 +1575,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -669,7 +1584,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -680,7 +1595,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -689,7 +1604,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -701,7 +1616,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -710,7 +1625,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -730,92 +1645,258 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v14, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v28i16: ; VI: ; %bb.0: @@ -824,7 +1905,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -840,7 +1921,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -851,7 +1932,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -867,7 +1948,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -879,7 +1960,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -895,7 +1976,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -915,183 +1996,426 @@ end: ret <28 x i16> %phi } -define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s8 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v27, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 ; -; VI-LABEL: bitcast_v28i16_to_v14i32: +; VI-LABEL: bitcast_v14i32_to_v28i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v15, 3 -; VI-NEXT: v_add_u16_e32 v14, 3, v13 -; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_u16_e32 v14, 3, v12 -; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v14, v12 -; VI-NEXT: v_add_u16_e32 v14, 3, v11 -; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v14, v11 -; VI-NEXT: v_add_u16_e32 v14, 3, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + +define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v28i16_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v28i16_to_v14i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v15, 3 +; VI-NEXT: v_add_u16_e32 v14, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_u16_e32 v14, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_add_u16_e32 v14, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v14, v11 +; VI-NEXT: v_add_u16_e32 v14, 3, v10 ; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v10, v14, v10 ; VI-NEXT: v_add_u16_e32 v14, 3, v9 @@ -1124,7 +2448,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1135,7 +2459,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -1151,7 +2475,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1163,7 +2487,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -1179,7 +2503,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1199,180 +2523,523 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v28i16_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s29, s41, s29 +; VI-NEXT: s_or_b32 s28, s40, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v13 -; GCN-NEXT: v_mov_b32_e32 v30, v12 -; GCN-NEXT: v_mov_b32_e32 v31, v11 -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v34, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v7 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v38, v4 -; GCN-NEXT: v_mov_b32_e32 v39, v3 -; GCN-NEXT: v_mov_b32_e32 v48, v2 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v30, v11 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v28f16: ; VI: ; %bb.0: @@ -1381,7 +3048,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -1397,7 +3064,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1408,7 +3075,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -1424,7 +3091,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1436,7 +3103,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -1452,7 +3119,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1472,223 +3139,505 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14i32: ; VI: ; %bb.0: @@ -1697,7 +3646,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1742,7 +3691,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1753,7 +3702,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -1770,7 +3719,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1782,7 +3731,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -1798,7 +3747,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1818,33 +3767,423 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v28f16_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v7i64: ; VI: ; %bb.0: @@ -1853,7 +4192,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -1869,7 +4208,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1880,7 +4219,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -1896,7 +4235,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1908,7 +4247,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -1917,7 +4256,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1937,33 +4276,227 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_3: +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v14f32: ; VI: ; %bb.0: @@ -1972,7 +4505,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -1988,7 +4521,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,7 +4532,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -2015,7 +4548,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2027,7 +4560,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2047,7 +4580,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2067,33 +4600,213 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v7i64_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v7f64: ; VI: ; %bb.0: @@ -2102,7 +4815,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2118,7 +4831,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2129,7 +4842,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2145,7 +4858,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2157,7 +4870,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -2166,7 +4879,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2186,26 +4899,220 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v14f32: ; VI: ; %bb.0: @@ -2214,7 +5121,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -2223,7 +5130,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2234,7 +5141,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -2243,7 +5150,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2255,7 +5162,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -2264,7 +5171,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2284,92 +5191,258 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB27_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_4 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_3: +; SI-NEXT: s_branch .LBB27_2 +; SI-NEXT: .LBB27_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB27_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_4 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_3: +; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v14, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28i16: ; VI: ; %bb.0: @@ -2378,7 +5451,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2394,7 +5467,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB28_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2405,7 +5478,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2421,7 +5494,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2433,7 +5506,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -2442,7 +5515,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2462,162 +5535,410 @@ end: ret <28 x i16> %phi } +define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v27, s12 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i16_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v14f32: ; VI: ; %bb.0: @@ -2626,7 +5947,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 3 ; VI-NEXT: v_add_u16_e32 v14, 3, v13 @@ -2671,7 +5992,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2682,7 +6003,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -2698,36 +6019,379 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v28i16_to_v14f32: +; GFX11-LABEL: bitcast_v28i16_to_v14f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + +define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v28i16_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s29, s41, s29 +; VI-NEXT: s_or_b32 s28, s40, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v14f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2747,179 +6411,179 @@ end: } define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v13 -; GCN-NEXT: v_mov_b32_e32 v30, v12 -; GCN-NEXT: v_mov_b32_e32 v31, v11 -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v34, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v7 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v38, v4 -; GCN-NEXT: v_mov_b32_e32 v39, v3 -; GCN-NEXT: v_mov_b32_e32 v48, v2 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v49 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v48 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v39 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v38 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v37 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v36 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v35 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v33 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v30, v11 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16: ; VI: ; %bb.0: @@ -2928,7 +6592,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2944,7 +6608,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2955,7 +6619,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2971,7 +6635,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2983,7 +6647,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -2992,7 +6656,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3012,223 +6676,516 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v14f32_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14f32: ; VI: ; %bb.0: @@ -3237,7 +7194,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3282,7 +7239,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3293,7 +7250,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -3310,7 +7267,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3322,7 +7279,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -3338,7 +7295,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3358,33 +7315,423 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v28f16_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v7f64: ; VI: ; %bb.0: @@ -3393,7 +7740,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -3409,7 +7756,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3420,7 +7767,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3436,7 +7783,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3448,7 +7795,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3468,7 +7815,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3488,26 +7835,205 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v7i64_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: s_branch .LBB37_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v7i64: ; VI: ; %bb.0: @@ -3516,7 +8042,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3525,7 +8051,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3536,7 +8062,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3545,7 +8071,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3557,7 +8083,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3566,7 +8092,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3586,92 +8112,258 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB39_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_4 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_3: +; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: .LBB39_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v14, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28i16: ; VI: ; %bb.0: @@ -3680,7 +8372,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -3696,7 +8388,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3707,7 +8399,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -3723,7 +8415,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,7 +8427,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3755,7 +8447,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3775,162 +8467,405 @@ end: ret <28 x i16> %phi } +define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s8 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v27, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v7i64_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB41_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i16_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v7i64: ; VI: ; %bb.0: @@ -3939,7 +8874,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 3 ; VI-NEXT: v_add_u16_e32 v14, 3, v13 @@ -3984,7 +8919,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3995,7 +8930,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -4011,7 +8946,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4023,7 +8958,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -4039,7 +8974,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4059,180 +8994,523 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v28i16_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s29, s41, s29 +; VI-NEXT: s_or_b32 s28, s40, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v13 -; GCN-NEXT: v_mov_b32_e32 v29, v12 -; GCN-NEXT: v_mov_b32_e32 v32, v11 -; GCN-NEXT: v_mov_b32_e32 v31, v10 -; GCN-NEXT: v_mov_b32_e32 v34, v9 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v7 -; GCN-NEXT: v_mov_b32_e32 v35, v6 -; GCN-NEXT: v_mov_b32_e32 v38, v5 -; GCN-NEXT: v_mov_b32_e32 v37, v4 -; GCN-NEXT: v_mov_b32_e32 v48, v3 -; GCN-NEXT: v_mov_b32_e32 v39, v2 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v49, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v39 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v48, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v37 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v38, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v36, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v33 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v34, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v31 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v32, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v30, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_mov_b32_e32 v28, v12 +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v49, v1 +; SI-NEXT: v_mov_b32_e32 v48, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v29, vcc +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28f16: ; VI: ; %bb.0: @@ -4241,7 +9519,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -4257,7 +9535,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4268,7 +9546,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -4284,7 +9562,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4296,7 +9574,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4316,7 +9594,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4336,223 +9614,505 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s24, 3 +; SI-NEXT: s_addc_u32 s21, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s25, s27, 0 +; SI-NEXT: s_lshr_b32 s26, s24, 16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s40, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v7i64_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB45_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7i64: ; VI: ; %bb.0: @@ -4561,7 +10121,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4606,7 +10166,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4617,7 +10177,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -4634,7 +10194,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4646,7 +10206,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -4662,7 +10222,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4682,97 +10242,487 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v28f16_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v49, v13 -; GCN-NEXT: v_mov_b32_e32 v48, v12 -; GCN-NEXT: v_mov_b32_e32 v38, v11 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v36, v9 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v7 -; GCN-NEXT: v_mov_b32_e32 v33, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v5 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v3 -; GCN-NEXT: v_mov_b32_e32 v29, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; GCN-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v29 -; GCN-NEXT: v_mov_b32_e32 v6, v30 -; GCN-NEXT: v_mov_b32_e32 v8, v31 -; GCN-NEXT: v_mov_b32_e32 v10, v32 -; GCN-NEXT: v_mov_b32_e32 v12, v33 -; GCN-NEXT: v_mov_b32_e32 v14, v34 -; GCN-NEXT: v_mov_b32_e32 v16, v35 -; GCN-NEXT: v_mov_b32_e32 v18, v36 -; GCN-NEXT: v_mov_b32_e32 v20, v37 -; GCN-NEXT: v_mov_b32_e32 v22, v38 -; GCN-NEXT: v_mov_b32_e32 v24, v48 -; GCN-NEXT: v_mov_b32_e32 v26, v49 -; GCN-NEXT: v_mov_b32_e32 v1, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v8, v31 +; SI-NEXT: v_mov_b32_e32 v10, v32 +; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mov_b32_e32 v16, v35 +; SI-NEXT: v_mov_b32_e32 v18, v36 +; SI-NEXT: v_mov_b32_e32 v20, v37 +; SI-NEXT: v_mov_b32_e32 v22, v38 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v1, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28i16: ; VI: ; %bb.0: @@ -4781,7 +10731,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -4790,7 +10740,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4801,7 +10751,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -4810,7 +10760,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4822,7 +10772,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -4831,7 +10781,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4851,162 +10801,397 @@ end: ret <28 x i16> %phi } +define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v28, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v29, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v30, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v31, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v32, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v33, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v34, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[26:27], 1.0 +; SI-NEXT: v_alignbit_b32 v28, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v29, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v32, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v27, s12 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v1, v34 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v9, v32 +; SI-NEXT: v_mov_b32_e32 v13, v31 +; SI-NEXT: v_mov_b32_e32 v17, v30 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i16_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v7f64: ; VI: ; %bb.0: @@ -5015,7 +11200,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 3 ; VI-NEXT: v_add_u16_e32 v14, 3, v13 @@ -5060,7 +11245,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5071,7 +11256,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -5087,7 +11272,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5099,7 +11284,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -5115,7 +11300,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5135,162 +11320,505 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v28i16_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s29, s41, s29 +; VI-NEXT: s_or_b32 s28, s40, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v34 -; GCN-NEXT: v_mov_b32_e32 v1, v49 -; GCN-NEXT: v_mov_b32_e32 v2, v28 -; GCN-NEXT: v_mov_b32_e32 v3, v48 -; GCN-NEXT: v_mov_b32_e32 v4, v29 -; GCN-NEXT: v_mov_b32_e32 v5, v39 -; GCN-NEXT: v_mov_b32_e32 v6, v30 -; GCN-NEXT: v_mov_b32_e32 v7, v38 -; GCN-NEXT: v_mov_b32_e32 v8, v31 -; GCN-NEXT: v_mov_b32_e32 v9, v37 -; GCN-NEXT: v_mov_b32_e32 v10, v32 -; GCN-NEXT: v_mov_b32_e32 v11, v36 -; GCN-NEXT: v_mov_b32_e32 v12, v33 -; GCN-NEXT: v_mov_b32_e32 v13, v35 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v1, v49 +; SI-NEXT: v_mov_b32_e32 v2, v39 +; SI-NEXT: v_mov_b32_e32 v3, v38 +; SI-NEXT: v_mov_b32_e32 v4, v37 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v6, v36 +; SI-NEXT: v_mov_b32_e32 v7, v33 +; SI-NEXT: v_mov_b32_e32 v8, v34 +; SI-NEXT: v_mov_b32_e32 v9, v30 +; SI-NEXT: v_mov_b32_e32 v10, v32 +; SI-NEXT: v_mov_b32_e32 v11, v28 +; SI-NEXT: v_mov_b32_e32 v12, v31 +; SI-NEXT: v_mov_b32_e32 v13, v29 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28f16: ; VI: ; %bb.0: @@ -5299,7 +11827,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -5308,7 +11836,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5319,7 +11847,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -5328,7 +11856,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5340,7 +11868,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -5349,7 +11877,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5369,223 +11897,488 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v7f64_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB53_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: +; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: .LBB53_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7f64: ; VI: ; %bb.0: @@ -5594,7 +12387,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5639,7 +12432,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5650,7 +12443,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -5667,7 +12460,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5679,7 +12472,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -5695,7 +12488,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5715,211 +12508,601 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v28f16_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v47, v27 -; GCN-NEXT: v_mov_b32_e32 v46, v26 -; GCN-NEXT: v_mov_b32_e32 v45, v25 -; GCN-NEXT: v_mov_b32_e32 v44, v24 -; GCN-NEXT: v_mov_b32_e32 v43, v23 -; GCN-NEXT: v_mov_b32_e32 v42, v22 -; GCN-NEXT: v_mov_b32_e32 v41, v21 -; GCN-NEXT: v_mov_b32_e32 v40, v20 -; GCN-NEXT: v_mov_b32_e32 v55, v19 -; GCN-NEXT: v_mov_b32_e32 v54, v18 -; GCN-NEXT: v_mov_b32_e32 v53, v17 -; GCN-NEXT: v_mov_b32_e32 v52, v16 -; GCN-NEXT: v_mov_b32_e32 v51, v15 -; GCN-NEXT: v_mov_b32_e32 v50, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v13 -; GCN-NEXT: v_mov_b32_e32 v48, v12 -; GCN-NEXT: v_mov_b32_e32 v39, v11 -; GCN-NEXT: v_mov_b32_e32 v38, v10 -; GCN-NEXT: v_mov_b32_e32 v37, v9 -; GCN-NEXT: v_mov_b32_e32 v36, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v7 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v5 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v31, v3 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v56, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i16_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v27 +; SI-NEXT: v_mov_b32_e32 v46, v26 +; SI-NEXT: v_mov_b32_e32 v45, v25 +; SI-NEXT: v_mov_b32_e32 v44, v24 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v55, v19 +; SI-NEXT: v_mov_b32_e32 v54, v18 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: v_mov_b32_e32 v52, v16 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v38, v10 +; SI-NEXT: v_mov_b32_e32 v37, v9 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v28f16: ; VI: ; %bb.0: @@ -5928,7 +13111,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 3 ; VI-NEXT: v_add_u16_sdwa v19, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5973,7 +13156,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v17 ; VI-NEXT: v_or_b32_e32 v1, v1, v16 ; VI-NEXT: v_or_b32_e32 v0, v0, v15 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5984,7 +13167,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -6000,7 +13183,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6012,7 +13195,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -6028,7 +13211,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6048,166 +13231,508 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v28i16_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s29, s41, s29 +; VI-NEXT: s_or_b32 s28, s40, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v28i16: ; VI: ; %bb.0: @@ -6216,7 +13741,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 @@ -6261,7 +13786,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v17, v2 ; VI-NEXT: v_or_b32_e32 v1, v16, v1 ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6272,7 +13797,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -6289,7 +13814,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6301,7 +13826,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -6317,7 +13842,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6336,3 +13861,380 @@ end: %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <28 x i16> %phi } + +define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v28f16_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s28, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s29, 16 +; VI-NEXT: v_add_f16_e32 v1, s28, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s27, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s26, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s25, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s29, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v13, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v14, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v14, v15 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index b52128024fbc3..6ec9c1177c180 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -1,58 +1,58 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v3bf16_to_v3f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_4 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB0_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: .LBB0_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3bf16_to_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_4 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB0_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: .LBB0_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3bf16_to_v3f16: ; VI: ; %bb.0: @@ -238,50 +238,306 @@ end: ret <3 x half> %phi } +define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_4 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_3: +; VI-NEXT: s_branch .LBB1_2 +; VI-NEXT: .LBB1_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fc0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_3: +; GFX9-NEXT: s_branch .LBB1_2 +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX11-TRUE16-NEXT: .LBB1_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB1_3: +; GFX11-TRUE16-NEXT: s_branch .LBB1_2 +; GFX11-TRUE16-NEXT: .LBB1_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX11-FAKE16-NEXT: .LBB1_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB1_3: +; GFX11-FAKE16-NEXT: s_branch .LBB1_2 +; GFX11-FAKE16-NEXT: .LBB1_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <3 x bfloat> %a1 to <3 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x bfloat> %a to <3 x half> + br label %end + +end: + %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x half> %phi +} + define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f16_to_v3bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_4 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB1_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: .LBB1_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f16_to_v3bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB2_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB2_4 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB2_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: .LBB2_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f16_to_v3bf16: ; VI: ; %bb.0: @@ -347,49 +603,162 @@ end: ret <3 x bfloat> %phi } +define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x7e000000, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x half> %a, splat (half 0xH0200) + %a2 = bitcast <3 x half> %a1 to <3 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x half> %a to <3 x bfloat> + br label %end + +end: + %phi = phi <3 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x bfloat> %phi +} + define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v3bf16_to_v3i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3bf16_to_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3bf16_to_v3i16: ; VI: ; %bb.0: @@ -398,7 +767,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -429,7 +798,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -440,7 +809,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -468,7 +837,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s6 ; GFX9-NEXT: s_movk_i32 s6, 0x7fc0 ; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -480,7 +849,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -516,7 +885,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 -; GFX11-TRUE16-NEXT: .LBB2_2: ; %end +; GFX11-TRUE16-NEXT: .LBB4_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -528,7 +897,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -558,7 +927,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, 0x7fc0, v1, 16 -; GFX11-FAKE16-NEXT: .LBB2_2: ; %end +; GFX11-FAKE16-NEXT: .LBB4_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -578,39 +947,284 @@ end: ret <3 x i16> %phi } +define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_4 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_3: +; VI-NEXT: s_branch .LBB5_2 +; VI-NEXT: .LBB5_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v0, v2, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: s_movk_i32 s4, 0x7fc0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_3: +; GFX9-NEXT: s_branch .LBB5_2 +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB5_3: +; GFX11-TRUE16-NEXT: s_branch .LBB5_2 +; GFX11-TRUE16-NEXT: .LBB5_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX11-FAKE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB5_3: +; GFX11-FAKE16-NEXT: s_branch .LBB5_2 +; GFX11-FAKE16-NEXT: .LBB5_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <3 x bfloat> %a1 to <3 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x bfloat> %a to <3 x i16> + br label %end + +end: + %phi = phi <3 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i16> %phi +} + define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i16_to_v3bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i16_to_v3bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3bf16: ; VI: ; %bb.0: @@ -674,34 +1288,140 @@ end: ret <3 x bfloat> %phi } +define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s7, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s4, 16 +; SI-NEXT: s_add_i32 s8, s5, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i16> %a, splat (i16 3) + %a2 = bitcast <3 x i16> %a1 to <3 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x i16> %a to <3 x bfloat> + br label %end + +end: + %phi = phi <3 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x bfloat> %phi +} + define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f16_to_v3i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f16_to_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f16_to_v3i16: ; VI: ; %bb.0: @@ -767,44 +1487,151 @@ end: ret <3 x i16> %phi } +define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f16_to_v3i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v3f16_to_v3i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_4 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x7e000000, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_3: +; VI-NEXT: s_branch .LBB9_2 +; VI-NEXT: .LBB9_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f16_to_v3i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_branch .LBB9_2 +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f16_to_v3i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: s_branch .LBB9_2 +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x half> %a, splat (half 0xH0200) + %a2 = bitcast <3 x half> %a1 to <3 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x half> %a to <3 x i16> + br label %end + +end: + %phi = phi <3 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i16> %phi +} + define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i16_to_v3f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i16_to_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3f16: ; VI: ; %bb.0: @@ -867,3 +1694,103 @@ end: %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <3 x half> %phi } + +define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i16_to_v3f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v3i16_to_v3f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v3i16_to_v3f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i16_to_v3f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i16> %a, splat (i16 3) + %a2 = bitcast <3 x i16> %a1 to <3 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i16> %a to <3 x half> + br label %end + +end: + %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 5f21bdc09a15d..7eaf481167b99 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -1,40 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v16f32: ; VI: ; %bb.0: @@ -140,35 +140,233 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v16i32_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v16i32: ; VI: ; %bb.0: @@ -177,7 +375,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -195,7 +393,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -206,7 +404,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -224,7 +422,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -236,7 +434,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -246,7 +444,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -266,35 +464,237 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v16f32_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v8i64: ; VI: ; %bb.0: @@ -303,7 +703,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -321,7 +721,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -332,7 +732,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -350,7 +750,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -362,7 +762,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -380,7 +780,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -400,35 +800,233 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v16i32_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v16i32: ; VI: ; %bb.0: @@ -437,7 +1035,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -455,7 +1053,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -466,7 +1064,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -484,7 +1082,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -496,7 +1094,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -518,7 +1116,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -538,35 +1136,233 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v8i64_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v8f64: ; VI: ; %bb.0: @@ -575,7 +1371,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -593,7 +1389,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -604,7 +1400,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -622,7 +1418,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -634,7 +1430,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -652,7 +1448,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -672,27 +1468,225 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v16i32_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v16i32: ; VI: ; %bb.0: @@ -701,7 +1695,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -711,7 +1705,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -722,7 +1716,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -732,7 +1726,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -744,7 +1738,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -754,7 +1748,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -774,102 +1768,272 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v8f64_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v15 -; GCN-NEXT: v_mov_b32_e32 v28, v14 -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32i16: ; VI: ; %bb.0: @@ -878,7 +2042,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -896,7 +2060,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -907,7 +2071,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -925,7 +2089,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -937,7 +2101,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -955,7 +2119,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -975,183 +2139,434 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v16i32: ; VI: ; %bb.0: @@ -1160,7 +2575,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -1211,7 +2626,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,7 +2637,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -1240,7 +2655,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1252,7 +2667,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -1270,7 +2685,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1290,248 +2705,605 @@ end: ret <16 x i32> %phi } -define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v15 -; GCN-NEXT: v_mov_b32_e32 v34, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v13 -; GCN-NEXT: v_mov_b32_e32 v36, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v11 -; GCN-NEXT: v_mov_b32_e32 v38, v10 -; GCN-NEXT: v_mov_b32_e32 v39, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v49, v7 -; GCN-NEXT: v_mov_b32_e32 v50, v6 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v53 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v16i32_to_v32f16: +; VI-LABEL: bitcast_v32i16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + +define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v16i32_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i32_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1542,7 +3314,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -1560,7 +3332,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1572,7 +3344,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -1590,7 +3362,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1610,269 +3382,583 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v16i32_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v16i32: ; VI: ; %bb.0: @@ -1881,7 +3967,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1932,7 +4018,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1943,7 +4029,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -1962,7 +4048,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1974,7 +4060,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -1992,7 +4078,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2012,170 +4098,570 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v32f16_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32bf16: ; VI: ; %bb.0: @@ -2184,7 +4670,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -2202,7 +4688,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2213,7 +4699,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -2231,7 +4717,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2243,7 +4729,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -2261,7 +4747,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2281,237 +4767,551 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s79, v1 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s26, 16 +; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s25, 16 +; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s24, 16 +; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s23, 16 +; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s22, 16 +; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s21, 16 +; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s20, 16 +; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s18, 16 +; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s78, s78, 3 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s26, 16 +; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s25, 16 +; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s24, 16 +; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s23, 16 +; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s22, 16 +; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s21, 16 +; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s20, 16 +; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s18, 16 +; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s77 +; SI-NEXT: v_mov_b32_e32 v1, s76 +; SI-NEXT: v_mov_b32_e32 v2, s75 +; SI-NEXT: v_mov_b32_e32 v3, s74 +; SI-NEXT: v_mov_b32_e32 v4, s73 +; SI-NEXT: v_mov_b32_e32 v5, s72 +; SI-NEXT: v_mov_b32_e32 v6, s63 +; SI-NEXT: v_mov_b32_e32 v7, s62 +; SI-NEXT: v_mov_b32_e32 v8, s61 +; SI-NEXT: v_mov_b32_e32 v9, s60 +; SI-NEXT: v_mov_b32_e32 v10, s59 +; SI-NEXT: v_mov_b32_e32 v11, s58 +; SI-NEXT: v_mov_b32_e32 v12, s57 +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: v_mov_b32_e32 v14, s47 +; SI-NEXT: v_mov_b32_e32 v15, s46 +; SI-NEXT: v_mov_b32_e32 v16, s45 +; SI-NEXT: v_mov_b32_e32 v17, s44 +; SI-NEXT: v_mov_b32_e32 v18, s43 +; SI-NEXT: v_mov_b32_e32 v19, s42 +; SI-NEXT: v_mov_b32_e32 v20, s41 +; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v22, s15 +; SI-NEXT: v_mov_b32_e32 v23, s14 +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v25, s12 +; SI-NEXT: v_mov_b32_e32 v26, s11 +; SI-NEXT: v_mov_b32_e32 v27, s10 +; SI-NEXT: v_mov_b32_e32 v28, s9 +; SI-NEXT: v_mov_b32_e32 v29, s8 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB11_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB11_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB22_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB22_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v16i32: ; VI: ; %bb.0: @@ -2520,7 +5320,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -2811,7 +5611,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2822,7 +5622,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -3066,7 +5866,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3078,7 +5878,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -3352,7 +6152,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3364,7 +6164,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -3642,7 +6442,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3662,413 +6462,1657 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB23_5 +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB23_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB23_5 +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB23_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_or_b32_e32 v60, v1, v18 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v7, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; GCN-NEXT: v_or_b32_e32 v54, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v4, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 -; GCN-NEXT: v_or_b32_e32 v48, v5, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; GCN-NEXT: v_or_b32_e32 v34, v6, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v50 -; GCN-NEXT: v_or_b32_e32 v11, v8, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 -; GCN-NEXT: v_or_b32_e32 v13, v10, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v38 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 -; GCN-NEXT: v_or_b32_e32 v28, v12, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GCN-NEXT: v_or_b32_e32 v21, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v31 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v15, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v31, v19, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; GCN-NEXT: v_or_b32_e32 v27, v61, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; GCN-NEXT: v_or_b32_e32 v40, v40, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 -; GCN-NEXT: v_or_b32_e32 v56, v56, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_or_b32_e32 v49, v49, v47 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 -; GCN-NEXT: v_or_b32_e32 v51, v51, v43 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v36, v36, v55 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v37, v37, v53 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v30, v30, v50 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v32, v32, v39 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v25, v25, v38 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v41, v23 -; GCN-NEXT: v_or_b32_e32 v31, v57, v31 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v58, v40 -; GCN-NEXT: v_or_b32_e32 v35, v59, v56 -; GCN-NEXT: v_or_b32_e32 v38, v46, v49 -; GCN-NEXT: v_or_b32_e32 v39, v44, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v36 -; GCN-NEXT: v_or_b32_e32 v36, v54, v37 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_or_b32_e32 v30, v48, v32 -; GCN-NEXT: v_or_b32_e32 v24, v34, v24 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_or_b32_e32 v20, v28, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v33 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v64i8: ; VI: ; %bb.0: @@ -4142,7 +8186,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -4195,9 +8239,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: s_cbranch_execz .LBB24_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 @@ -4265,7 +8309,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -4473,7 +8517,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -4526,9 +8570,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 @@ -4596,7 +8640,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -4756,7 +8800,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -4790,9 +8834,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 @@ -4842,7 +8886,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB24_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5054,7 +9098,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -5104,9 +9148,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 @@ -5172,7 +9216,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5342,606 +9386,2577 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s61, 24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s58, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s47, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s44, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s43, 8 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s41, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s12, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v16i32_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s66, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s65, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s55, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s51, 0xff +; VI-NEXT: s_lshl_b32 s11, s50, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s49, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s48, 0xff +; VI-NEXT: s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s39, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s38, 0xff +; VI-NEXT: s_lshl_b32 s11, s37, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s35, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s30, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s91, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s90, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s11, s79, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s78, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s77, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_lshl_b32 s10, s74, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s72, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s61, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s59, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-NEXT: v_writelane_b32 v4, s31, 1 +; GFX9-NEXT: v_writelane_b32 v4, s34, 2 +; GFX9-NEXT: v_writelane_b32 v4, s35, 3 +; GFX9-NEXT: v_writelane_b32 v4, s36, 4 +; GFX9-NEXT: v_writelane_b32 v4, s37, 5 +; GFX9-NEXT: v_writelane_b32 v4, s38, 6 +; GFX9-NEXT: v_writelane_b32 v4, s39, 7 +; GFX9-NEXT: v_writelane_b32 v4, s48, 8 +; GFX9-NEXT: v_writelane_b32 v4, s49, 9 +; GFX9-NEXT: v_writelane_b32 v4, s50, 10 +; GFX9-NEXT: v_writelane_b32 v4, s51, 11 +; GFX9-NEXT: v_writelane_b32 v4, s52, 12 +; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s5, s5, 3 +; GFX9-NEXT: s_add_i32 s4, s4, 3 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_and_b32 s7, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s55, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s44, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s51, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s50, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s42, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s48, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s38, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s37, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s40, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s35, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s31, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s14, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s94, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s92, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s91, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s89, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s79, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s78, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s76, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s74, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s73, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s63, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s7, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v4, 15 +; GFX9-NEXT: v_readlane_b32 s54, v4, 14 +; GFX9-NEXT: v_readlane_b32 s53, v4, 13 +; GFX9-NEXT: v_readlane_b32 s52, v4, 12 +; GFX9-NEXT: v_readlane_b32 s51, v4, 11 +; GFX9-NEXT: v_readlane_b32 s50, v4, 10 +; GFX9-NEXT: v_readlane_b32 s49, v4, 9 +; GFX9-NEXT: v_readlane_b32 s48, v4, 8 +; GFX9-NEXT: v_readlane_b32 s39, v4, 7 +; GFX9-NEXT: v_readlane_b32 s38, v4, 6 +; GFX9-NEXT: v_readlane_b32 s37, v4, 5 +; GFX9-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_v16i32_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s48 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s37 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s35 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s31 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s94 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s93 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s91 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s73 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s47 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_v16i32_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s48, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s39, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s38, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s37, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s36, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s35, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s34, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s28, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s31, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s30, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, vcc_hi, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s95, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s94, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s93, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s92, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s91, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s89, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s79, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s78, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s76, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s75, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s74, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s73, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s72, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s63, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s60, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s59, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s47, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s46, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s45, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB13_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB13_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB26_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB26_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v16i32: ; VI: ; %bb.0: @@ -6055,7 +12070,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6219,9 +12234,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB13_2: ; %Flow +; VI-NEXT: .LBB26_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_4 +; VI-NEXT: s_cbranch_execz .LBB26_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6370,7 +12385,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB13_4: ; %end +; VI-NEXT: .LBB26_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -6515,7 +12530,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6679,9 +12694,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: .LBB26_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: s_cbranch_execz .LBB26_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6830,7 +12845,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB13_4: ; %end +; GFX9-NEXT: .LBB26_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -6957,15 +12972,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -7147,8 +13162,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -7427,15 +13442,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -7614,8 +13629,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -7812,35 +13827,2138 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v64i8_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v49, v13 +; VI-NEXT: v_mov_b32_e32 v50, v12 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v7, v7, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 +; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v6, v6, v21 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB27_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB27_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB27_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB27_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB27_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v8i64: ; VI: ; %bb.0: @@ -7849,7 +15967,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -7867,7 +15985,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB28_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7878,7 +15996,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -7896,7 +16014,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7908,7 +16026,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -7918,7 +16036,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7938,35 +16056,237 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v16f32_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v16f32: ; VI: ; %bb.0: @@ -7975,7 +16295,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -7993,7 +16313,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8004,7 +16324,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -8022,7 +16342,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8034,7 +16354,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8056,7 +16376,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8076,35 +16396,233 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v8i64_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v8f64: ; VI: ; %bb.0: @@ -8113,7 +16631,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8131,7 +16649,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8142,7 +16660,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8160,7 +16678,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8172,7 +16690,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -8182,7 +16700,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8202,27 +16720,229 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v16f32_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v16f32: ; VI: ; %bb.0: @@ -8231,7 +16951,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -8241,7 +16961,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8252,7 +16972,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -8262,7 +16982,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8274,7 +16994,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -8284,7 +17004,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8304,102 +17024,272 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v8f64_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v15 -; GCN-NEXT: v_mov_b32_e32 v28, v14 -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB36_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32i16: ; VI: ; %bb.0: @@ -8408,7 +17298,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8426,7 +17316,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8437,7 +17327,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8455,7 +17345,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8467,7 +17357,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -8477,7 +17367,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8497,183 +17387,438 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v16f32_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB38_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB38_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v16f32: ; VI: ; %bb.0: @@ -8682,7 +17827,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -8733,7 +17878,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8744,7 +17889,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -8762,7 +17907,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8774,7 +17919,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -8792,7 +17937,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8812,221 +17957,578 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v32i16_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v15 -; GCN-NEXT: v_mov_b32_e32 v34, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v13 -; GCN-NEXT: v_mov_b32_e32 v36, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v11 -; GCN-NEXT: v_mov_b32_e32 v38, v10 -; GCN-NEXT: v_mov_b32_e32 v39, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v49, v7 -; GCN-NEXT: v_mov_b32_e32 v50, v6 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v53 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v55 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v54 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v53 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v52 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v51 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v50 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v49 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v48 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v39 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v38 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v37 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v36 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v35 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32f16: ; VI: ; %bb.0: @@ -9035,7 +18537,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9053,7 +18555,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,7 +18566,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9082,7 +18584,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9094,7 +18596,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -9104,7 +18606,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9124,269 +18626,587 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s7, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v16f32_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v16f32: ; VI: ; %bb.0: @@ -9395,7 +19215,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9446,7 +19266,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9457,7 +19277,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -9476,7 +19296,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9488,7 +19308,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -9506,7 +19326,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9526,170 +19346,570 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v32f16_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v33 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v35 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v36 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v37 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v38 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v39 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v48 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v49 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v50 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v51 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v52 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v53 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v54 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v55 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v55 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32bf16: ; VI: ; %bb.0: @@ -9698,7 +19918,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9716,7 +19936,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9727,7 +19947,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9745,7 +19965,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9757,7 +19977,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -9767,7 +19987,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9787,237 +20007,537 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v34 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v33, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v16f32: ; VI: ; %bb.0: @@ -10026,7 +20546,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -10317,7 +20837,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10328,7 +20848,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -10572,7 +21092,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10584,7 +21104,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -10858,7 +21378,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10870,7 +21390,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -11148,7 +21668,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11168,413 +21688,1657 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB47_5 +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB47_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB47_5 +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB47_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_or_b32_e32 v60, v1, v18 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v7, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; GCN-NEXT: v_or_b32_e32 v54, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v4, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 -; GCN-NEXT: v_or_b32_e32 v48, v5, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; GCN-NEXT: v_or_b32_e32 v34, v6, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v50 -; GCN-NEXT: v_or_b32_e32 v11, v8, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 -; GCN-NEXT: v_or_b32_e32 v13, v10, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v38 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 -; GCN-NEXT: v_or_b32_e32 v28, v12, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GCN-NEXT: v_or_b32_e32 v21, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v31 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v15, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v31, v19, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; GCN-NEXT: v_or_b32_e32 v27, v61, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; GCN-NEXT: v_or_b32_e32 v40, v40, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 -; GCN-NEXT: v_or_b32_e32 v56, v56, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_or_b32_e32 v49, v49, v47 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 -; GCN-NEXT: v_or_b32_e32 v51, v51, v43 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v36, v36, v55 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v37, v37, v53 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v30, v30, v50 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v32, v32, v39 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v25, v25, v38 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v41, v23 -; GCN-NEXT: v_or_b32_e32 v31, v57, v31 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v58, v40 -; GCN-NEXT: v_or_b32_e32 v35, v59, v56 -; GCN-NEXT: v_or_b32_e32 v38, v46, v49 -; GCN-NEXT: v_or_b32_e32 v39, v44, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v36 -; GCN-NEXT: v_or_b32_e32 v36, v54, v37 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_or_b32_e32 v30, v48, v32 -; GCN-NEXT: v_or_b32_e32 v24, v34, v24 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_or_b32_e32 v20, v28, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v33 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v64i8: ; VI: ; %bb.0: @@ -11648,7 +23412,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -11701,9 +23465,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -11771,7 +23535,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -11979,7 +23743,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -12032,9 +23796,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -12102,7 +23866,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -12262,7 +24026,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -12296,9 +24060,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7 @@ -12340,7 +24104,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12552,7 +24316,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -12602,9 +24366,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7 @@ -12662,7 +24426,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12832,606 +24596,2717 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v3, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 +; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 8 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v22, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v6, 8 +; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 +; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 +; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 +; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 +; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 +; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 +; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v19 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 +; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 8 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v22, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v6, 8 +; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 +; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 +; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 +; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 +; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 +; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 +; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v19 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v28, v28, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v28, v28, v46 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v63 +; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v25, v28, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v20, v25, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v59 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v52 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v16f32_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s59, s5, 8 +; VI-NEXT: s_lshr_b32 s58, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s72, s29, 8 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s27, 8 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 8 +; VI-NEXT: s_lshr_b32 s89, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s35, s23, 8 +; VI-NEXT: s_lshr_b32 s34, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s48, s21, 8 +; VI-NEXT: s_lshr_b32 s39, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s53, s19, 8 +; VI-NEXT: s_lshr_b32 s52, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s66, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s24, 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] +; VI-NEXT: v_add_f32_e64 v10, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_add_f32_e64 v12, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s28, 1.0 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] +; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; VI-NEXT: v_add_f32_e64 v18, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s16, 1.0 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v17 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v20, s44 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v20, s42 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v11, s20 +; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s27 +; VI-NEXT: v_mov_b32_e32 v3, s28 +; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v19, s67 +; VI-NEXT: v_mov_b32_e32 v62, s65 +; VI-NEXT: v_mov_b32_e32 v13, s66 +; VI-NEXT: v_mov_b32_e32 v60, s64 +; VI-NEXT: v_mov_b32_e32 v61, s55 +; VI-NEXT: v_mov_b32_e32 v58, s54 +; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v57, s53 +; VI-NEXT: v_mov_b32_e32 v47, s51 +; VI-NEXT: v_mov_b32_e32 v56, s50 +; VI-NEXT: v_mov_b32_e32 v46, s49 +; VI-NEXT: v_mov_b32_e32 v45, s39 +; VI-NEXT: v_mov_b32_e32 v44, s48 +; VI-NEXT: v_mov_b32_e32 v42, s38 +; VI-NEXT: v_mov_b32_e32 v43, s37 +; VI-NEXT: v_mov_b32_e32 v41, s36 +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: v_mov_b32_e32 v55, s35 +; VI-NEXT: v_mov_b32_e32 v53, s31 +; VI-NEXT: v_mov_b32_e32 v54, s30 +; VI-NEXT: v_mov_b32_e32 v52, s91 +; VI-NEXT: v_mov_b32_e32 v51, s89 +; VI-NEXT: v_mov_b32_e32 v50, s90 +; VI-NEXT: v_mov_b32_e32 v48, s88 +; VI-NEXT: v_mov_b32_e32 v49, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v38, s76 +; VI-NEXT: v_mov_b32_e32 v37, s77 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v34, s73 +; VI-NEXT: v_mov_b32_e32 v33, s63 +; VI-NEXT: v_mov_b32_e32 v32, s72 +; VI-NEXT: v_mov_b32_e32 v30, s62 +; VI-NEXT: v_mov_b32_e32 v31, s61 +; VI-NEXT: v_mov_b32_e32 v29, s60 +; VI-NEXT: v_mov_b32_e32 v28, s58 +; VI-NEXT: v_mov_b32_e32 v27, s59 +; VI-NEXT: v_mov_b32_e32 v14, s57 +; VI-NEXT: v_mov_b32_e32 v26, s56 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s8 +; VI-NEXT: v_mov_b32_e32 v25, s6 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v25 +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v62, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; VI-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; VI-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v12, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v22 +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; VI-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; VI-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v20 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; VI-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f32_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s59, s5, 8 +; GFX9-NEXT: s_lshr_b32 s58, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s72, s29, 8 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s27, 8 +; GFX9-NEXT: s_lshr_b32 s76, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 8 +; GFX9-NEXT: s_lshr_b32 s89, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s95, s23, 8 +; GFX9-NEXT: s_lshr_b32 s94, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s36, s21, 8 +; GFX9-NEXT: s_lshr_b32 s35, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s49, s19, 8 +; GFX9-NEXT: s_lshr_b32 s48, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s54, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s24, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e64 v10, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e64 v12, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s28, 1.0 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_add_f32_e64 v20, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v19, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v17, s55 +; GFX9-NEXT: v_mov_b32_e32 v62, s53 +; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v60, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v57, s49 +; GFX9-NEXT: v_mov_b32_e32 v47, s39 +; GFX9-NEXT: v_mov_b32_e32 v56, s38 +; GFX9-NEXT: v_mov_b32_e32 v46, s37 +; GFX9-NEXT: v_mov_b32_e32 v45, s35 +; GFX9-NEXT: v_mov_b32_e32 v44, s36 +; GFX9-NEXT: v_mov_b32_e32 v42, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s31 +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v55, s95 +; GFX9-NEXT: v_mov_b32_e32 v53, s93 +; GFX9-NEXT: v_mov_b32_e32 v54, s92 +; GFX9-NEXT: v_mov_b32_e32 v52, s91 +; GFX9-NEXT: v_mov_b32_e32 v51, s89 +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: v_mov_b32_e32 v48, s88 +; GFX9-NEXT: v_mov_b32_e32 v49, s79 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v38, s76 +; GFX9-NEXT: v_mov_b32_e32 v37, s77 +; GFX9-NEXT: v_mov_b32_e32 v35, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s74 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s63 +; GFX9-NEXT: v_mov_b32_e32 v32, s72 +; GFX9-NEXT: v_mov_b32_e32 v30, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s61 +; GFX9-NEXT: v_mov_b32_e32 v29, s60 +; GFX9-NEXT: v_mov_b32_e32 v28, s58 +; GFX9-NEXT: v_mov_b32_e32 v27, s59 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v18, s56 +; GFX9-NEXT: v_mov_b32_e32 v23, s12 +; GFX9-NEXT: v_mov_b32_e32 v24, s10 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16f32_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v21, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, s0, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s18, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s77 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s75 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v87, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xff, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v85, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v82, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v69, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v21, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v22, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v17, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v23, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v21, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v23, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v38, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v6 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16f32_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, s0, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s20, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v23 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v6, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v4, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v2, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s49 :: v_dual_mov_b32 v87, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s48 :: v_dual_mov_b32 v85, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s37 :: v_dual_mov_b32 v83, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s34 :: v_dual_mov_b32 v81, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s31 :: v_dual_mov_b32 v71, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, vcc_hi :: v_dual_mov_b32 v69, s94 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s95 :: v_dual_mov_b32 v67, s93 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s92 :: v_dual_mov_b32 v65, s91 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s89 :: v_dual_mov_b32 v55, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s88 :: v_dual_mov_b32 v53, s79 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s78 :: v_dual_mov_b32 v51, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s77 :: v_dual_mov_b32 v49, s75 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s74 :: v_dual_mov_b32 v39, s73 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s63 :: v_dual_mov_b32 v37, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s62 :: v_dual_mov_b32 v35, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s60 :: v_dual_mov_b32 v33, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s59 :: v_dual_mov_b32 v31, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s56 :: v_dual_mov_b32 v29, s47 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v7, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s46 :: v_dual_mov_b32 v11, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s44 :: v_dual_mov_b32 v17, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v21, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s10 :: v_dual_mov_b32 v26, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_mov_b32 v28, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v87, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v23, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v85, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v87, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v69, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, v24, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, v20, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v15, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v28, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v14, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v5, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v8 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[82:85], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v16f32: ; VI: ; %bb.0: @@ -13545,7 +27420,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -13709,9 +27584,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -13860,7 +27735,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -14005,7 +27880,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -14169,9 +28044,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -14320,7 +28195,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -14447,15 +28322,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -14637,8 +28512,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -14917,15 +28792,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -15104,8 +28979,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -15302,35 +29177,2138 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v64i8_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v49, v13 +; VI-NEXT: v_mov_b32_e32 v50, v12 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v7, v7, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 +; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v6, v6, v21 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v8f64: ; VI: ; %bb.0: @@ -15339,7 +31317,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -15357,7 +31335,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15368,7 +31346,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -15386,7 +31364,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15398,7 +31376,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -15420,7 +31398,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15440,27 +31418,224 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v8i64_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v8i64: ; VI: ; %bb.0: @@ -15469,7 +31644,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -15479,7 +31654,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15490,7 +31665,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -15500,7 +31675,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15512,7 +31687,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -15522,7 +31697,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15542,102 +31717,272 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v8f64_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v15 -; GCN-NEXT: v_mov_b32_e32 v28, v14 -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32i16: ; VI: ; %bb.0: @@ -15646,7 +31991,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -15664,7 +32009,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15675,7 +32020,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -15693,7 +32038,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15705,7 +32050,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -15727,7 +32072,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15747,183 +32092,434 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB58_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v8i64: ; VI: ; %bb.0: @@ -15932,7 +32528,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -15983,7 +32579,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15994,7 +32590,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -16012,7 +32608,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -16024,7 +32620,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -16042,7 +32638,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -16062,221 +32658,578 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v32i16_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_3 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB59_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: s_branch .LBB59_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v15 -; GCN-NEXT: v_mov_b32_e32 v33, v14 -; GCN-NEXT: v_mov_b32_e32 v36, v13 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v38, v11 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v48, v9 -; GCN-NEXT: v_mov_b32_e32 v39, v8 -; GCN-NEXT: v_mov_b32_e32 v50, v7 -; GCN-NEXT: v_mov_b32_e32 v49, v6 -; GCN-NEXT: v_mov_b32_e32 v52, v5 -; GCN-NEXT: v_mov_b32_e32 v51, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_mov_b32_e32 v53, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: .LBB30_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: .LBB30_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v37, v11 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v39, v9 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v48, v6 +; SI-NEXT: v_mov_b32_e32 v51, v5 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v53, v3 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v55, v1 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v51, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v33, vcc +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32f16: ; VI: ; %bb.0: @@ -16285,7 +33238,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: s_cbranch_execz .LBB60_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -16303,7 +33256,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB30_2: ; %end +; VI-NEXT: .LBB60_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -16314,7 +33267,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB30_2 +; GFX9-NEXT: s_cbranch_execz .LBB60_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -16332,7 +33285,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB30_2: ; %end +; GFX9-NEXT: .LBB60_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -16344,7 +33297,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -16366,7 +33319,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: .LBB60_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -16386,269 +33339,583 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_add_u32 s10, s18, 3 +; SI-NEXT: s_addc_u32 s11, s19, 0 +; SI-NEXT: s_lshr_b32 s12, s10, 16 +; SI-NEXT: s_lshr_b32 s13, s11, 16 +; SI-NEXT: s_add_u32 s14, s20, 3 +; SI-NEXT: s_addc_u32 s15, s21, 0 +; SI-NEXT: s_lshr_b32 s16, s14, 16 +; SI-NEXT: s_lshr_b32 s17, s15, 16 +; SI-NEXT: s_add_u32 s18, s22, 3 +; SI-NEXT: s_addc_u32 s19, s23, 0 +; SI-NEXT: s_lshr_b32 s20, s18, 16 +; SI-NEXT: s_lshr_b32 s21, s19, 16 +; SI-NEXT: s_add_u32 s22, s24, 3 +; SI-NEXT: s_addc_u32 s23, s25, 0 +; SI-NEXT: s_lshr_b32 s24, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v8i64_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB31_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB31_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v8i64: ; VI: ; %bb.0: @@ -16657,7 +33924,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -16708,7 +33975,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -16719,7 +33986,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: s_cbranch_execz .LBB62_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -16738,7 +34005,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB31_2: ; %end +; GFX9-NEXT: .LBB62_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -16750,7 +34017,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -16768,7 +34035,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: .LBB62_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -16788,170 +34055,570 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v32f16_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB63_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_3 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_4: +; VI-NEXT: s_branch .LBB63_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_3 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB63_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: s_branch .LBB63_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32bf16: ; VI: ; %bb.0: @@ -16960,7 +34627,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: s_cbranch_execz .LBB64_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -16978,7 +34645,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB32_2: ; %end +; VI-NEXT: .LBB64_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -16989,7 +34656,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: s_cbranch_execz .LBB64_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -17007,7 +34674,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB32_2: ; %end +; GFX9-NEXT: .LBB64_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17019,7 +34686,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -17041,7 +34708,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17061,237 +34728,551 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s79, v1 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s26, 16 +; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s25, 16 +; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s24, 16 +; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s23, 16 +; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s22, 16 +; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s21, 16 +; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s20, 16 +; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s18, 16 +; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s17, s19, 0 +; SI-NEXT: s_add_u32 s18, s20, 3 +; SI-NEXT: s_addc_u32 s19, s21, 0 +; SI-NEXT: s_add_u32 s20, s22, 3 +; SI-NEXT: s_addc_u32 s21, s23, 0 +; SI-NEXT: s_add_u32 s22, s24, 3 +; SI-NEXT: s_addc_u32 s23, s25, 0 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s15, s27, 0 +; SI-NEXT: s_add_u32 s13, s28, 3 +; SI-NEXT: s_addc_u32 s11, s29, 0 +; SI-NEXT: s_add_u32 s9, s78, 3 +; SI-NEXT: s_addc_u32 s7, s79, 0 +; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s10, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s12, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s40, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s24, 16 +; SI-NEXT: s_and_b32 s42, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s23, 16 +; SI-NEXT: s_and_b32 s44, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s22, 16 +; SI-NEXT: s_and_b32 s46, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s21, 16 +; SI-NEXT: s_and_b32 s56, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s20, 16 +; SI-NEXT: s_and_b32 s58, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s19, 16 +; SI-NEXT: s_and_b32 s60, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s18, 16 +; SI-NEXT: s_and_b32 s62, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s17, 16 +; SI-NEXT: s_and_b32 s72, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s16, 16 +; SI-NEXT: s_and_b32 s74, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s5, 16 +; SI-NEXT: s_and_b32 s76, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s77 +; SI-NEXT: v_mov_b32_e32 v1, s76 +; SI-NEXT: v_mov_b32_e32 v2, s75 +; SI-NEXT: v_mov_b32_e32 v3, s74 +; SI-NEXT: v_mov_b32_e32 v4, s73 +; SI-NEXT: v_mov_b32_e32 v5, s72 +; SI-NEXT: v_mov_b32_e32 v6, s63 +; SI-NEXT: v_mov_b32_e32 v7, s62 +; SI-NEXT: v_mov_b32_e32 v8, s61 +; SI-NEXT: v_mov_b32_e32 v9, s60 +; SI-NEXT: v_mov_b32_e32 v10, s59 +; SI-NEXT: v_mov_b32_e32 v11, s58 +; SI-NEXT: v_mov_b32_e32 v12, s57 +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: v_mov_b32_e32 v14, s47 +; SI-NEXT: v_mov_b32_e32 v15, s46 +; SI-NEXT: v_mov_b32_e32 v16, s45 +; SI-NEXT: v_mov_b32_e32 v17, s44 +; SI-NEXT: v_mov_b32_e32 v18, s43 +; SI-NEXT: v_mov_b32_e32 v19, s42 +; SI-NEXT: v_mov_b32_e32 v20, s41 +; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v22, s15 +; SI-NEXT: v_mov_b32_e32 v23, s14 +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v25, s12 +; SI-NEXT: v_mov_b32_e32 v26, s11 +; SI-NEXT: v_mov_b32_e32 v27, s10 +; SI-NEXT: v_mov_b32_e32 v28, s9 +; SI-NEXT: v_mov_b32_e32 v29, s8 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB33_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB33_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v8i64: ; VI: ; %bb.0: @@ -17300,7 +35281,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -17591,7 +35572,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17602,7 +35583,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -17846,7 +35827,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17858,7 +35839,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -18132,7 +36113,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18144,7 +36125,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -18422,7 +36403,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -18442,413 +36423,1657 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB67_5 +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB67_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB67_5 +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB67_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v23, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v50, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v23, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v50, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_or_b32_e32 v60, v1, v18 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v7, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; GCN-NEXT: v_or_b32_e32 v54, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v4, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 -; GCN-NEXT: v_or_b32_e32 v48, v5, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; GCN-NEXT: v_or_b32_e32 v34, v6, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v51 -; GCN-NEXT: v_or_b32_e32 v11, v8, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 -; GCN-NEXT: v_or_b32_e32 v13, v10, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v38 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 -; GCN-NEXT: v_or_b32_e32 v28, v12, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GCN-NEXT: v_or_b32_e32 v21, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v32 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v15, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v32, v19, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; GCN-NEXT: v_or_b32_e32 v27, v61, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; GCN-NEXT: v_or_b32_e32 v40, v40, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 -; GCN-NEXT: v_or_b32_e32 v56, v56, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_or_b32_e32 v49, v49, v47 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 -; GCN-NEXT: v_or_b32_e32 v50, v50, v43 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v36, v36, v55 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v37, v37, v53 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v39 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v25, v25, v38 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v24, v41, v24 -; GCN-NEXT: v_or_b32_e32 v32, v57, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v58, v40 -; GCN-NEXT: v_or_b32_e32 v35, v59, v56 -; GCN-NEXT: v_or_b32_e32 v38, v46, v49 -; GCN-NEXT: v_or_b32_e32 v39, v44, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v36 -; GCN-NEXT: v_or_b32_e32 v36, v54, v37 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_or_b32_e32 v30, v48, v31 -; GCN-NEXT: v_or_b32_e32 v23, v34, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_or_b32_e32 v20, v28, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v33 -; GCN-NEXT: v_or_b32_e32 v24, v29, v24 -; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v64i8: ; VI: ; %bb.0: @@ -18922,7 +38147,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -18975,9 +38200,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: s_cbranch_execz .LBB68_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -19045,7 +38270,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -19253,7 +38478,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -19306,9 +38531,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -19376,7 +38601,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -19536,7 +38761,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -19570,9 +38795,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB34_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -19627,7 +38852,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB34_4: ; %end +; GFX11-TRUE16-NEXT: .LBB68_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -19839,7 +39064,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -19889,9 +39114,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB68_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -19962,7 +39187,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -20132,606 +39357,2577 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s61, 24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s58, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s47, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s44, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s43, 8 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s41, 24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s12, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v8i64_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s4, s4, 3 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s66, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s65, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s55, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s51, 0xff +; VI-NEXT: s_lshl_b32 s11, s50, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s49, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s48, 0xff +; VI-NEXT: s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s39, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s38, 0xff +; VI-NEXT: s_lshl_b32 s11, s37, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s35, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s30, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s91, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s90, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s11, s79, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s78, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s77, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_lshl_b32 s10, s74, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s72, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s61, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s59, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-NEXT: v_writelane_b32 v4, s31, 1 +; GFX9-NEXT: v_writelane_b32 v4, s34, 2 +; GFX9-NEXT: v_writelane_b32 v4, s35, 3 +; GFX9-NEXT: v_writelane_b32 v4, s36, 4 +; GFX9-NEXT: v_writelane_b32 v4, s37, 5 +; GFX9-NEXT: v_writelane_b32 v4, s38, 6 +; GFX9-NEXT: v_writelane_b32 v4, s39, 7 +; GFX9-NEXT: v_writelane_b32 v4, s48, 8 +; GFX9-NEXT: v_writelane_b32 v4, s49, 9 +; GFX9-NEXT: v_writelane_b32 v4, s50, 10 +; GFX9-NEXT: v_writelane_b32 v4, s51, 11 +; GFX9-NEXT: v_writelane_b32 v4, s52, 12 +; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s4, s4, 3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: s_and_b32 s7, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s55, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s44, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s51, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s50, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s42, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s48, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s38, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s37, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s40, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s35, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s31, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s14, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s94, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s92, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s91, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s89, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s79, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s78, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s76, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s74, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s73, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s63, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s7, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v4, 15 +; GFX9-NEXT: v_readlane_b32 s54, v4, 14 +; GFX9-NEXT: v_readlane_b32 s53, v4, 13 +; GFX9-NEXT: v_readlane_b32 s52, v4, 12 +; GFX9-NEXT: v_readlane_b32 s51, v4, 11 +; GFX9-NEXT: v_readlane_b32 s50, v4, 10 +; GFX9-NEXT: v_readlane_b32 s49, v4, 9 +; GFX9-NEXT: v_readlane_b32 s48, v4, 8 +; GFX9-NEXT: v_readlane_b32 s39, v4, 7 +; GFX9-NEXT: v_readlane_b32 s38, v4, 6 +; GFX9-NEXT: v_readlane_b32 s37, v4, 5 +; GFX9-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v8i64_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s48 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s37 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s35 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s31 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s94 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s93 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s91 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s73 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s47 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v8i64_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s48, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s39, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s38, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s37, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s36, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s35, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s34, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s28, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s31, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s30, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, vcc_hi, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s95, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s94, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s93, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s92, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s91, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s89, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s79, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s78, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s76, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s75, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s74, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s73, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s72, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s63, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s60, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s59, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s47, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s46, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s45, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB35_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB35_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB70_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v8i64: ; VI: ; %bb.0: @@ -20845,7 +42041,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: s_cbranch_execz .LBB70_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21009,9 +42205,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB35_2: ; %Flow +; VI-NEXT: .LBB70_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_4 +; VI-NEXT: s_cbranch_execz .LBB70_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21160,7 +42356,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB35_4: ; %end +; VI-NEXT: .LBB70_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -21305,7 +42501,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21469,9 +42665,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB35_2: ; %Flow +; GFX9-NEXT: .LBB70_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_4 +; GFX9-NEXT: s_cbranch_execz .LBB70_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21620,7 +42816,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB35_4: ; %end +; GFX9-NEXT: .LBB70_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -21747,15 +42943,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -21937,8 +43133,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -22217,15 +43413,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -22404,8 +43600,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -22602,108 +43798,2211 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v64i8_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v49, v13 +; VI-NEXT: v_mov_b32_e32 v50, v12 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v7, v7, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 +; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v6, v6, v21 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-TRUE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB71_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB71_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB71_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-FAKE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB71_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB71_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v38, v7 -; GCN-NEXT: v_mov_b32_e32 v37, v6 -; GCN-NEXT: v_mov_b32_e32 v36, v5 -; GCN-NEXT: v_mov_b32_e32 v35, v4 -; GCN-NEXT: v_mov_b32_e32 v34, v3 -; GCN-NEXT: v_mov_b32_e32 v33, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 -; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 -; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 -; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v34 -; GCN-NEXT: v_mov_b32_e32 v8, v35 -; GCN-NEXT: v_mov_b32_e32 v10, v36 -; GCN-NEXT: v_mov_b32_e32 v12, v37 -; GCN-NEXT: v_mov_b32_e32 v14, v38 -; GCN-NEXT: v_mov_b32_e32 v16, v48 -; GCN-NEXT: v_mov_b32_e32 v18, v49 -; GCN-NEXT: v_mov_b32_e32 v20, v50 -; GCN-NEXT: v_mov_b32_e32 v22, v51 -; GCN-NEXT: v_mov_b32_e32 v24, v52 -; GCN-NEXT: v_mov_b32_e32 v26, v53 -; GCN-NEXT: v_mov_b32_e32 v28, v54 -; GCN-NEXT: v_mov_b32_e32 v30, v55 -; GCN-NEXT: v_mov_b32_e32 v1, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB72_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 +; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v12, v37 +; SI-NEXT: v_mov_b32_e32 v14, v38 +; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: v_mov_b32_e32 v18, v49 +; SI-NEXT: v_mov_b32_e32 v20, v50 +; SI-NEXT: v_mov_b32_e32 v22, v51 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v26, v53 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32i16: ; VI: ; %bb.0: @@ -22712,7 +46011,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -22722,7 +46021,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -22733,7 +46032,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -22743,7 +46042,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -22755,7 +46054,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -22765,7 +46064,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: .LBB72_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -22785,183 +46084,421 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v1, v34 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v13, v37 +; SI-NEXT: v_mov_b32_e32 v17, v38 +; SI-NEXT: v_mov_b32_e32 v21, v39 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v8f64_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB73_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_3 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB73_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_4: +; VI-NEXT: s_branch .LBB73_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_3 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB73_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: s_branch .LBB73_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB74_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v8f64: ; VI: ; %bb.0: @@ -22970,7 +46507,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -23021,7 +46558,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: .LBB74_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23032,7 +46569,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB37_2 +; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -23050,7 +46587,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB37_2: ; %end +; GFX9-NEXT: .LBB74_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23062,7 +46599,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -23080,7 +46617,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB37_2: ; %end +; GFX11-NEXT: .LBB74_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23100,193 +46637,558 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v32i16_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_3 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB75_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: s_branch .LBB75_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v39 -; GCN-NEXT: v_mov_b32_e32 v1, v55 -; GCN-NEXT: v_mov_b32_e32 v2, v32 -; GCN-NEXT: v_mov_b32_e32 v3, v54 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v5, v53 -; GCN-NEXT: v_mov_b32_e32 v6, v34 -; GCN-NEXT: v_mov_b32_e32 v7, v52 -; GCN-NEXT: v_mov_b32_e32 v8, v35 -; GCN-NEXT: v_mov_b32_e32 v9, v51 -; GCN-NEXT: v_mov_b32_e32 v10, v36 -; GCN-NEXT: v_mov_b32_e32 v11, v50 -; GCN-NEXT: v_mov_b32_e32 v12, v37 -; GCN-NEXT: v_mov_b32_e32 v13, v49 -; GCN-NEXT: v_mov_b32_e32 v14, v38 -; GCN-NEXT: v_mov_b32_e32 v15, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v2, v53 +; SI-NEXT: v_mov_b32_e32 v3, v52 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v34 +; SI-NEXT: v_mov_b32_e32 v12, v37 +; SI-NEXT: v_mov_b32_e32 v13, v32 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32f16: ; VI: ; %bb.0: @@ -23295,7 +47197,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23305,7 +47207,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23316,7 +47218,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23326,7 +47228,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23338,7 +47240,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23348,7 +47250,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: .LBB76_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23368,269 +47270,555 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: s_lshr_b32 s6, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v8f64_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB77_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_3 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB77_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_4: +; VI-NEXT: s_branch .LBB77_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_3 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB77_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: s_branch .LBB77_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB39_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB39_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB78_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB78_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v8f64: ; VI: ; %bb.0: @@ -23639,7 +47827,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -23690,7 +47878,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23701,7 +47889,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB39_2 +; GFX9-NEXT: s_cbranch_execz .LBB78_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -23720,7 +47908,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: .LBB78_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23732,7 +47920,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -23750,7 +47938,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB39_2: ; %end +; GFX11-NEXT: .LBB78_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23770,150 +47958,550 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v32f16_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: s_branch .LBB79_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: s_branch .LBB79_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v0 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: v_mov_b32_e32 v1, v54 -; GCN-NEXT: v_mov_b32_e32 v2, v53 -; GCN-NEXT: v_mov_b32_e32 v3, v52 -; GCN-NEXT: v_mov_b32_e32 v4, v51 -; GCN-NEXT: v_mov_b32_e32 v5, v50 -; GCN-NEXT: v_mov_b32_e32 v6, v49 -; GCN-NEXT: v_mov_b32_e32 v7, v48 -; GCN-NEXT: v_mov_b32_e32 v8, v39 -; GCN-NEXT: v_mov_b32_e32 v9, v38 -; GCN-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NEXT: v_mov_b32_e32 v11, v36 -; GCN-NEXT: v_mov_b32_e32 v12, v35 -; GCN-NEXT: v_mov_b32_e32 v13, v34 -; GCN-NEXT: v_mov_b32_e32 v14, v33 -; GCN-NEXT: v_mov_b32_e32 v15, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB80_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v55 +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v2, v53 +; SI-NEXT: v_mov_b32_e32 v3, v52 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v50 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; SI-NEXT: v_mov_b32_e32 v7, v48 +; SI-NEXT: v_mov_b32_e32 v8, v39 +; SI-NEXT: v_mov_b32_e32 v9, v38 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v11, v36 +; SI-NEXT: v_mov_b32_e32 v12, v35 +; SI-NEXT: v_mov_b32_e32 v13, v34 +; SI-NEXT: v_mov_b32_e32 v14, v33 +; SI-NEXT: v_mov_b32_e32 v15, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32bf16: ; VI: ; %bb.0: @@ -23922,7 +48510,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23932,7 +48520,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23943,7 +48531,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23953,7 +48541,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23965,7 +48553,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23975,7 +48563,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23995,237 +48583,505 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v35, s29 +; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB81_2 +; +; VI-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB81_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_3 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB81_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_4: +; VI-NEXT: s_branch .LBB81_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_3 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB81_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: s_branch .LBB81_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB41_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB41_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB82_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB82_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v8f64: ; VI: ; %bb.0: @@ -24234,7 +49090,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -24525,7 +49381,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -24536,7 +49392,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -24780,7 +49636,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -24792,7 +49648,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -25066,7 +49922,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25078,7 +49934,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -25356,7 +50212,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -25376,425 +50232,1649 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB83_5 +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB83_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB83_5 +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB83_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_3: +; GFX11-NEXT: s_branch .LBB83_2 +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v21, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v27, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v32, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v52, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v42, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v54, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v55, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v21, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v27, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v32, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v52, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v42, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v54, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v55, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v45 -; GCN-NEXT: v_or_b32_e32 v45, v1, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GCN-NEXT: v_or_b32_e32 v18, v2, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v4, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GCN-NEXT: v_or_b32_e32 v40, v5, v1 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v54 -; GCN-NEXT: v_and_b32_e32 v62, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; GCN-NEXT: v_or_b32_e32 v14, v6, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v61, 0xff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v15, v7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v59 -; GCN-NEXT: v_or_b32_e32 v16, v8, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v34 -; GCN-NEXT: v_or_b32_e32 v17, v9, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v56 -; GCN-NEXT: v_or_b32_e32 v34, v10, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v30 -; GCN-NEXT: v_or_b32_e32 v30, v11, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v44 -; GCN-NEXT: v_or_b32_e32 v11, v12, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; GCN-NEXT: v_or_b32_e32 v13, v13, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v50 -; GCN-NEXT: v_or_b32_e32 v24, v19, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v22 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GCN-NEXT: v_or_b32_e32 v21, v20, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; GCN-NEXT: v_or_b32_e32 v36, v54, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v62 -; GCN-NEXT: v_or_b32_e32 v31, v60, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v61 -; GCN-NEXT: v_or_b32_e32 v56, v25, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v58 -; GCN-NEXT: v_or_b32_e32 v57, v57, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; GCN-NEXT: v_or_b32_e32 v37, v37, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v39, v39, v47 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v32, v32, v46 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v33, v33, v43 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v27, v27, v41 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v48, v48, v53 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v49, v49, v50 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v22, v22, v51 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v35, v35, v44 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v29, v29, v54 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v38, v38, v55 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v26, v59, v26 -; GCN-NEXT: v_or_b32_e32 v36, v45, v36 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_or_b32_e32 v31, v42, v56 -; GCN-NEXT: v_or_b32_e32 v50, v58, v57 -; GCN-NEXT: v_or_b32_e32 v37, v40, v37 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v34, v48 -; GCN-NEXT: v_or_b32_e32 v30, v30, v49 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v35 -; GCN-NEXT: v_or_b32_e32 v22, v24, v29 -; GCN-NEXT: v_or_b32_e32 v21, v21, v38 -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v47, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v57, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v47, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v57, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v47 +; SI-NEXT: v_or_b32_e32 v47, v47, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v64i8: ; VI: ; %bb.0: @@ -25868,7 +51948,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -25921,9 +52001,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -25983,7 +52063,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -26191,7 +52271,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -26244,9 +52324,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -26306,7 +52386,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -26466,7 +52546,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -26500,9 +52580,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB42_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -26544,7 +52624,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB42_4: ; %end +; GFX11-TRUE16-NEXT: .LBB84_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -26756,7 +52836,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -26806,9 +52886,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB84_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -26866,7 +52946,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -27036,606 +53116,2593 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 +; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 +; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_alignbit_b32 v20, s29, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, s29, v1, 16 +; SI-NEXT: v_alignbit_b32 v19, s29, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s26 +; SI-NEXT: v_alignbit_b32 v6, s27, v1, 24 +; SI-NEXT: v_alignbit_b32 v21, s27, v1, 16 +; SI-NEXT: v_alignbit_b32 v22, s27, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_alignbit_b32 v8, s25, v1, 24 +; SI-NEXT: v_alignbit_b32 v23, s25, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, s25, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: v_alignbit_b32 v10, s23, v1, 24 +; SI-NEXT: v_alignbit_b32 v25, s23, v1, 16 +; SI-NEXT: v_alignbit_b32 v26, s23, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: v_alignbit_b32 v12, s21, v1, 24 +; SI-NEXT: v_alignbit_b32 v14, s21, v1, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_alignbit_b32 v27, s19, v1, 24 +; SI-NEXT: v_alignbit_b32 v28, s19, v1, 16 +; SI-NEXT: v_alignbit_b32 v29, s19, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: v_alignbit_b32 v30, s17, v1, 24 +; SI-NEXT: v_alignbit_b32 v31, s17, v1, 16 +; SI-NEXT: v_alignbit_b32 v32, s17, v1, 8 +; SI-NEXT: s_lshr_b32 s8, s5, 24 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s5, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_readfirstlane_b32 s29, v4 +; SI-NEXT: v_readfirstlane_b32 s27, v6 +; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v10 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v16 +; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 +; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 +; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 +; SI-NEXT: v_alignbit_b32 v20, s29, v3, 24 +; SI-NEXT: v_alignbit_b32 v4, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v19, s29, v3, 8 +; SI-NEXT: v_alignbit_b32 v6, s27, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, s27, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, s25, v7, 24 +; SI-NEXT: v_alignbit_b32 v23, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, s25, v7, 8 +; SI-NEXT: v_alignbit_b32 v10, s23, v9, 24 +; SI-NEXT: v_alignbit_b32 v25, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, s23, v9, 8 +; SI-NEXT: v_alignbit_b32 v12, s21, v11, 24 +; SI-NEXT: s_lshr_b32 s8, s5, 24 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s5, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: v_alignbit_b32 v14, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v11, 8 +; SI-NEXT: v_alignbit_b32 v27, s19, v13, 24 +; SI-NEXT: v_alignbit_b32 v28, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, s19, v13, 8 +; SI-NEXT: v_alignbit_b32 v30, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v31, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v32, s17, v15, 8 +; SI-NEXT: s_branch .LBB85_5 +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s6, s63, 8 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s62, 0xff +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s61, 24 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v15, v15, v30 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: buffer_store_dword v30, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s6, s60, 8 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s58, 24 +; SI-NEXT: v_or_b32_e32 v15, v27, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v16 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s6, s57, 8 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s47, 24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v26 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s6, s46, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s44, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s6, s43, 8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v23 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s41, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v22 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s6, s40, 8 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s14, 24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s6, s13, 8 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v20 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s11, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: v_writelane_b32 v40, s34, 2 +; VI-NEXT: v_writelane_b32 v40, s35, 3 +; VI-NEXT: v_writelane_b32 v40, s36, 4 +; VI-NEXT: v_writelane_b32 v40, s37, 5 +; VI-NEXT: v_writelane_b32 v40, s38, 6 +; VI-NEXT: v_writelane_b32 v40, s39, 7 +; VI-NEXT: v_writelane_b32 v40, s48, 8 +; VI-NEXT: v_writelane_b32 v40, s49, 9 +; VI-NEXT: v_writelane_b32 v40, s50, 10 +; VI-NEXT: v_writelane_b32 v40, s51, 11 +; VI-NEXT: v_writelane_b32 v40, s52, 12 +; VI-NEXT: v_writelane_b32 v40, s53, 13 +; VI-NEXT: v_writelane_b32 v40, s54, 14 +; VI-NEXT: v_writelane_b32 v40, s55, 15 +; VI-NEXT: v_writelane_b32 v40, s64, 16 +; VI-NEXT: v_writelane_b32 v40, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v40, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v40, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s37, s4, 16 +; VI-NEXT: s_lshr_b32 s36, s4, 8 +; VI-NEXT: s_lshr_b32 s59, s29, 24 +; VI-NEXT: s_lshr_b32 s60, s29, 16 +; VI-NEXT: s_lshr_b32 s61, s29, 8 +; VI-NEXT: s_lshr_b32 s39, s28, 16 +; VI-NEXT: s_lshr_b32 s38, s28, 8 +; VI-NEXT: s_lshr_b32 s62, s27, 24 +; VI-NEXT: s_lshr_b32 s63, s27, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 8 +; VI-NEXT: s_lshr_b32 s49, s26, 16 +; VI-NEXT: s_lshr_b32 s48, s26, 8 +; VI-NEXT: s_lshr_b32 s73, s25, 24 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 8 +; VI-NEXT: s_lshr_b32 s51, s24, 16 +; VI-NEXT: s_lshr_b32 s50, s24, 8 +; VI-NEXT: s_lshr_b32 s76, s23, 24 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: s_lshr_b32 s53, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s79, s21, 24 +; VI-NEXT: s_lshr_b32 s88, s21, 16 +; VI-NEXT: s_lshr_b32 s89, s21, 8 +; VI-NEXT: s_lshr_b32 s55, s20, 16 +; VI-NEXT: s_lshr_b32 s54, s20, 8 +; VI-NEXT: s_lshr_b32 s90, s19, 24 +; VI-NEXT: s_lshr_b32 s91, s19, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s64, s18, 8 +; VI-NEXT: s_lshr_b32 s31, s17, 24 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s17, 8 +; VI-NEXT: s_lshr_b32 s67, s16, 16 +; VI-NEXT: s_lshr_b32 s66, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s19, v10 +; VI-NEXT: v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s23, v12 +; VI-NEXT: v_readfirstlane_b32 s25, v8 +; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_readfirstlane_b32 s29, v4 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; VI-NEXT: s_lshr_b32 s59, s29, 24 +; VI-NEXT: s_lshr_b32 s60, s29, 16 +; VI-NEXT: s_lshr_b32 s61, s29, 8 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; VI-NEXT: s_lshr_b32 s62, s27, 24 +; VI-NEXT: s_lshr_b32 s63, s27, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: s_lshr_b32 s73, s25, 24 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: s_lshr_b32 s76, s23, 24 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; VI-NEXT: s_lshr_b32 s79, s21, 24 +; VI-NEXT: s_lshr_b32 s88, s21, 16 +; VI-NEXT: s_lshr_b32 s89, s21, 8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15 +; VI-NEXT: s_lshr_b32 s90, s19, 24 +; VI-NEXT: s_lshr_b32 s91, s19, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: s_lshr_b32 s31, s17, 24 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s17, 8 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v48, s67 +; VI-NEXT: v_mov_b32_e32 v49, s66 +; VI-NEXT: v_mov_b32_e32 v38, s65 +; VI-NEXT: v_mov_b32_e32 v39, s64 +; VI-NEXT: v_mov_b32_e32 v36, s55 +; VI-NEXT: v_mov_b32_e32 v37, s54 +; VI-NEXT: v_mov_b32_e32 v34, s53 +; VI-NEXT: v_mov_b32_e32 v35, s52 +; VI-NEXT: v_mov_b32_e32 v32, s51 +; VI-NEXT: v_mov_b32_e32 v33, s50 +; VI-NEXT: v_mov_b32_e32 v30, s49 +; VI-NEXT: v_mov_b32_e32 v31, s48 +; VI-NEXT: v_mov_b32_e32 v28, s39 +; VI-NEXT: v_mov_b32_e32 v29, s38 +; VI-NEXT: v_mov_b32_e32 v26, s37 +; VI-NEXT: v_mov_b32_e32 v27, s36 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v3, s28 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s10 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v20, s14 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v18, s42 +; VI-NEXT: v_mov_b32_e32 v17, s44 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s34, 0xff +; VI-NEXT: s_lshl_b32 s7, s31, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: s_lshl_b32 s6, s30, 8 +; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s7, s90, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s7, s79, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s78, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s77, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 24, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s6, s75, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s74, 0xff +; VI-NEXT: s_lshl_b32 s7, s73, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s72, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s63, 0xff +; VI-NEXT: s_lshl_b32 s7, s62, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s29, 0xff +; VI-NEXT: s_lshl_b32 s6, s61, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s60, 0xff +; VI-NEXT: s_lshl_b32 s7, s59, 8 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v40, 19 +; VI-NEXT: v_readlane_b32 s66, v40, 18 +; VI-NEXT: v_readlane_b32 s65, v40, 17 +; VI-NEXT: v_readlane_b32 s64, v40, 16 +; VI-NEXT: v_readlane_b32 s55, v40, 15 +; VI-NEXT: v_readlane_b32 s54, v40, 14 +; VI-NEXT: v_readlane_b32 s53, v40, 13 +; VI-NEXT: v_readlane_b32 s52, v40, 12 +; VI-NEXT: v_readlane_b32 s51, v40, 11 +; VI-NEXT: v_readlane_b32 s50, v40, 10 +; VI-NEXT: v_readlane_b32 s49, v40, 9 +; VI-NEXT: v_readlane_b32 s48, v40, 8 +; VI-NEXT: v_readlane_b32 s39, v40, 7 +; VI-NEXT: v_readlane_b32 s38, v40, 6 +; VI-NEXT: v_readlane_b32 s37, v40, 5 +; VI-NEXT: v_readlane_b32 s36, v40, 4 +; VI-NEXT: v_readlane_b32 s35, v40, 3 +; VI-NEXT: v_readlane_b32 s34, v40, 2 +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s35, 3 +; GFX9-NEXT: v_writelane_b32 v40, s36, 4 +; GFX9-NEXT: v_writelane_b32 v40, s37, 5 +; GFX9-NEXT: v_writelane_b32 v40, s38, 6 +; GFX9-NEXT: v_writelane_b32 v40, s39, 7 +; GFX9-NEXT: v_writelane_b32 v40, s48, 8 +; GFX9-NEXT: v_writelane_b32 v40, s49, 9 +; GFX9-NEXT: v_writelane_b32 v40, s50, 10 +; GFX9-NEXT: v_writelane_b32 v40, s51, 11 +; GFX9-NEXT: v_writelane_b32 v40, s52, 12 +; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v40, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s31, s4, 16 +; GFX9-NEXT: s_lshr_b32 s30, s4, 8 +; GFX9-NEXT: s_lshr_b32 s59, s29, 24 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s29, 8 +; GFX9-NEXT: s_lshr_b32 s35, s28, 16 +; GFX9-NEXT: s_lshr_b32 s34, s28, 8 +; GFX9-NEXT: s_lshr_b32 s62, s27, 24 +; GFX9-NEXT: s_lshr_b32 s63, s27, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 8 +; GFX9-NEXT: s_lshr_b32 s37, s26, 16 +; GFX9-NEXT: s_lshr_b32 s36, s26, 8 +; GFX9-NEXT: s_lshr_b32 s73, s25, 24 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s25, 8 +; GFX9-NEXT: s_lshr_b32 s39, s24, 16 +; GFX9-NEXT: s_lshr_b32 s38, s24, 8 +; GFX9-NEXT: s_lshr_b32 s76, s23, 24 +; GFX9-NEXT: s_lshr_b32 s77, s23, 16 +; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s48, s22, 8 +; GFX9-NEXT: s_lshr_b32 s79, s21, 24 +; GFX9-NEXT: s_lshr_b32 s88, s21, 16 +; GFX9-NEXT: s_lshr_b32 s89, s21, 8 +; GFX9-NEXT: s_lshr_b32 s51, s20, 16 +; GFX9-NEXT: s_lshr_b32 s50, s20, 8 +; GFX9-NEXT: s_lshr_b32 s90, s19, 24 +; GFX9-NEXT: s_lshr_b32 s91, s19, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s52, s18, 8 +; GFX9-NEXT: s_lshr_b32 s93, s17, 24 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s17, 8 +; GFX9-NEXT: s_lshr_b32 s55, s16, 16 +; GFX9-NEXT: s_lshr_b32 s54, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; GFX9-NEXT: v_readfirstlane_b32 s17, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v12 +; GFX9-NEXT: v_readfirstlane_b32 s21, v16 +; GFX9-NEXT: v_readfirstlane_b32 s23, v10 +; GFX9-NEXT: v_readfirstlane_b32 s25, v8 +; GFX9-NEXT: v_readfirstlane_b32 s27, v6 +; GFX9-NEXT: v_readfirstlane_b32 s29, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: s_lshr_b32 s59, s29, 24 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s29, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX9-NEXT: s_lshr_b32 s62, s27, 24 +; GFX9-NEXT: s_lshr_b32 s63, s27, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: s_lshr_b32 s73, s25, 24 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s25, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: s_lshr_b32 s76, s23, 24 +; GFX9-NEXT: s_lshr_b32 s77, s23, 16 +; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: s_lshr_b32 s79, s21, 24 +; GFX9-NEXT: s_lshr_b32 s88, s21, 16 +; GFX9-NEXT: s_lshr_b32 s89, s21, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15 +; GFX9-NEXT: s_lshr_b32 s90, s19, 24 +; GFX9-NEXT: s_lshr_b32 s91, s19, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: s_lshr_b32 s93, s17, 24 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s17, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v39, s55 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v37, s53 +; GFX9-NEXT: v_mov_b32_e32 v48, s52 +; GFX9-NEXT: v_mov_b32_e32 v36, s51 +; GFX9-NEXT: v_mov_b32_e32 v38, s50 +; GFX9-NEXT: v_mov_b32_e32 v34, s49 +; GFX9-NEXT: v_mov_b32_e32 v35, s48 +; GFX9-NEXT: v_mov_b32_e32 v32, s39 +; GFX9-NEXT: v_mov_b32_e32 v33, s38 +; GFX9-NEXT: v_mov_b32_e32 v30, s37 +; GFX9-NEXT: v_mov_b32_e32 v31, s36 +; GFX9-NEXT: v_mov_b32_e32 v28, s35 +; GFX9-NEXT: v_mov_b32_e32 v29, s34 +; GFX9-NEXT: v_mov_b32_e32 v26, s31 +; GFX9-NEXT: v_mov_b32_e32 v27, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v24, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s10 +; GFX9-NEXT: v_mov_b32_e32 v21, s12 +; GFX9-NEXT: v_mov_b32_e32 v20, s14 +; GFX9-NEXT: v_mov_b32_e32 v19, s40 +; GFX9-NEXT: v_mov_b32_e32 v18, s42 +; GFX9-NEXT: v_mov_b32_e32 v17, s44 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s93, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s92, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s90, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s89, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s79, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s78, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s75, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s74, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s72, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s63, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s61, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s59, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v40, 15 +; GFX9-NEXT: v_readlane_b32 s54, v40, 14 +; GFX9-NEXT: v_readlane_b32 s53, v40, 13 +; GFX9-NEXT: v_readlane_b32 s52, v40, 12 +; GFX9-NEXT: v_readlane_b32 s51, v40, 11 +; GFX9-NEXT: v_readlane_b32 s50, v40, 10 +; GFX9-NEXT: v_readlane_b32 s49, v40, 9 +; GFX9-NEXT: v_readlane_b32 s48, v40, 8 +; GFX9-NEXT: v_readlane_b32 s39, v40, 7 +; GFX9-NEXT: v_readlane_b32 s38, v40, 6 +; GFX9-NEXT: v_readlane_b32 s37, v40, 5 +; GFX9-NEXT: v_readlane_b32 s36, v40, 4 +; GFX9-NEXT: v_readlane_b32 s35, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8f64_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v33, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[1:2], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v25 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v21 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v17 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s21, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s23, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s25, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s27, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[20:21] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[4:5] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[14:15] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s92 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s89 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s88 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s78 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v26 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s76 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s77 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v20, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v22 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v27, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s73 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s62 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s72 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s61 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s60 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s59 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s56 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s58 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s46 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v7, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v1, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v20 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[24:27], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[16:19], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v33, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v33, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v33, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v33, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v33, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v33, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v33, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v33, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v33, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v33, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v33, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8f64_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v33, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[1:2], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v25 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v23 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v19 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s25, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s27, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[22:23] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[20:21], 24, v[14:15] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[24:25] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v3, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v1, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v27, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v5, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v31, s49 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v29, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s4 :: v_dual_mov_b32 v23, s37 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s8 :: v_dual_mov_b32 v25, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v19, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v21, s34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v17, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s48 :: v_dual_mov_b32 v11, vcc_hi +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s38 :: v_dual_mov_b32 v13, s95 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s94 :: v_dual_mov_b32 v2, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s93 :: v_dual_mov_b32 v4, s91 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s89, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s79, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s88, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s78, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s77, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s76, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s75, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s73, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v31, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s74, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s72, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s19, 0xff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s63, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v17 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s0 :: v_dual_lshlrev_b32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v17, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s61, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s59, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s60, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s58, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s56, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v8, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_and_b32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s47, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s45, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v14, v21 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s44, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s42, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v18, v19 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[22:25], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v33, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v33, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v33, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v33, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v33, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v33, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v33, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v33, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v33, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v33, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB43_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB43_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB86_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB86_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v8f64: ; VI: ; %bb.0: @@ -27749,7 +55816,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: s_cbranch_execz .LBB86_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -27913,9 +55980,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB43_2: ; %Flow +; VI-NEXT: .LBB86_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_4 +; VI-NEXT: s_cbranch_execz .LBB86_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -28064,7 +56131,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB43_4: ; %end +; VI-NEXT: .LBB86_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -28209,7 +56276,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: s_cbranch_execz .LBB86_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -28373,9 +56440,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB43_2: ; %Flow +; GFX9-NEXT: .LBB86_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_4 +; GFX9-NEXT: s_cbranch_execz .LBB86_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -28524,7 +56591,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB43_4: ; %end +; GFX9-NEXT: .LBB86_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -28651,15 +56718,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -28841,8 +56908,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -29121,15 +57188,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -29308,8 +57375,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -29506,254 +57573,2355 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v64i8_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v49, v13 +; VI-NEXT: v_mov_b32_e32 v50, v12 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v6 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; VI-NEXT: v_or_b32_sdwa v0, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_or_b32_sdwa v2, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v48 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_or_b32_sdwa v17, v42, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v7, v7, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; VI-NEXT: v_or_b32_sdwa v16, v41, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v24 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v40, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v28 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v51, vcc, 3, v57 +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v55, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v51 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v46 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v56 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v38 +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v62 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v45 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v53, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v36 +; VI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v16, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 24, v59 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s17, 8 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s10, s18, 0xff +; VI-NEXT: s_lshl_b32 s8, s19, 24 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xff +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s22, 0xff +; VI-NEXT: s_lshl_b32 s4, s29, 8 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v31 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v47 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v49 +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: v_or_b32_sdwa v31, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x300, v31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v16, v16, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v5, v5, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v6, v6, v21 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: v_mov_b32_e32 v19, v52 +; VI-NEXT: v_mov_b32_e32 v27, v3 +; VI-NEXT: v_mov_b32_e32 v52, v53 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v23, v13 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-TRUE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB87_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB87_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB87_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-FAKE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB87_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB87_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v62, v30 -; GCN-NEXT: v_mov_b32_e32 v61, v29 -; GCN-NEXT: v_mov_b32_e32 v60, v28 -; GCN-NEXT: v_mov_b32_e32 v59, v27 -; GCN-NEXT: v_mov_b32_e32 v58, v26 -; GCN-NEXT: v_mov_b32_e32 v57, v25 -; GCN-NEXT: v_mov_b32_e32 v56, v24 -; GCN-NEXT: v_mov_b32_e32 v47, v23 -; GCN-NEXT: v_mov_b32_e32 v46, v22 -; GCN-NEXT: v_mov_b32_e32 v45, v21 -; GCN-NEXT: v_mov_b32_e32 v44, v20 -; GCN-NEXT: v_mov_b32_e32 v43, v19 -; GCN-NEXT: v_mov_b32_e32 v42, v18 -; GCN-NEXT: v_mov_b32_e32 v41, v17 -; GCN-NEXT: v_mov_b32_e32 v40, v16 -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v62 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB44_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: .LBB44_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v60, v28 +; SI-NEXT: v_mov_b32_e32 v59, v27 +; SI-NEXT: v_mov_b32_e32 v58, v26 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v56, v24 +; SI-NEXT: v_mov_b32_e32 v47, v23 +; SI-NEXT: v_mov_b32_e32 v46, v22 +; SI-NEXT: v_mov_b32_e32 v45, v21 +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v43, v19 +; SI-NEXT: v_mov_b32_e32 v42, v18 +; SI-NEXT: v_mov_b32_e32 v41, v17 +; SI-NEXT: v_mov_b32_e32 v40, v16 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB88_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v32f16: ; VI: ; %bb.0: @@ -29762,7 +59930,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 3 ; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -29813,7 +59981,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v19 ; VI-NEXT: v_or_b32_e32 v1, v1, v18 ; VI-NEXT: v_or_b32_e32 v0, v0, v17 -; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: .LBB88_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -29824,7 +59992,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: s_cbranch_execz .LBB88_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -29842,7 +60010,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB44_2: ; %end +; GFX9-NEXT: .LBB88_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -29854,7 +60022,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: s_cbranch_execz .LBB88_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -29872,7 +60040,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB44_2: ; %end +; GFX11-NEXT: .LBB88_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -29892,191 +60060,577 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v17 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_3 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB89_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: s_branch .LBB89_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32i16: ; VI: ; %bb.0: @@ -30085,7 +60639,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 @@ -30136,7 +60690,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: .LBB90_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -30147,7 +60701,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB45_2 +; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -30166,7 +60720,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB45_2: ; %end +; GFX9-NEXT: .LBB90_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -30178,7 +60732,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -30196,7 +60750,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB45_2: ; %end +; GFX11-NEXT: .LBB90_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -30216,206 +60770,584 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v32f16_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB91_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_3 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v19, v15 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v19, v14 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v19, v13 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v19, v12 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v19, v11 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v19, v10 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v19, v9 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v19, v7 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v19, v6 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v19, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v19, v4 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_or_b32_e32 v2, v19, v2 +; VI-NEXT: v_or_b32_e32 v1, v18, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB91_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_4: +; VI-NEXT: s_branch .LBB91_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_3 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB91_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: s_branch .LBB91_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v30 -; GCN-NEXT: v_mov_b32_e32 v54, v28 -; GCN-NEXT: v_mov_b32_e32 v53, v26 -; GCN-NEXT: v_mov_b32_e32 v52, v24 -; GCN-NEXT: v_mov_b32_e32 v51, v22 -; GCN-NEXT: v_mov_b32_e32 v50, v20 -; GCN-NEXT: v_mov_b32_e32 v49, v18 -; GCN-NEXT: v_mov_b32_e32 v48, v16 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_mov_b32_e32 v38, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v36, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v6 -; GCN-NEXT: v_mov_b32_e32 v34, v4 -; GCN-NEXT: v_mov_b32_e32 v33, v2 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v2, v29, v2 -; GCN-NEXT: v_or_b32_e32 v4, v27, v4 -; GCN-NEXT: v_or_b32_e32 v6, v25, v6 -; GCN-NEXT: v_or_b32_e32 v8, v23, v8 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_or_b32_e32 v13, v13, v18 -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: v_or_b32_e32 v9, v9, v22 -; GCN-NEXT: v_or_b32_e32 v7, v7, v24 -; GCN-NEXT: v_or_b32_e32 v5, v5, v26 -; GCN-NEXT: v_or_b32_e32 v3, v3, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v55, v28 +; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: v_mov_b32_e32 v53, v24 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_mov_b32_e32 v51, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_mov_b32_e32 v48, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v32bf16: ; VI: ; %bb.0: @@ -30424,7 +61356,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 3 ; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -30475,7 +61407,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v19 ; VI-NEXT: v_or_b32_e32 v1, v1, v18 ; VI-NEXT: v_or_b32_e32 v0, v0, v17 -; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: .LBB92_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -30486,7 +61418,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: s_cbranch_execz .LBB92_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -30504,7 +61436,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB46_2: ; %end +; GFX9-NEXT: .LBB92_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -30516,7 +61448,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -30534,7 +61466,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB46_2: ; %end +; GFX11-NEXT: .LBB92_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -30554,299 +61486,710 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v16, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v20 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s14, s16, 16 +; SI-NEXT: s_lshl_b32 s15, s17, 16 +; SI-NEXT: s_lshl_b32 s40, s18, 16 +; SI-NEXT: s_lshl_b32 s41, s19, 16 +; SI-NEXT: s_lshl_b32 s42, s20, 16 +; SI-NEXT: s_lshl_b32 s43, s21, 16 +; SI-NEXT: s_lshl_b32 s6, s22, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_lshl_b32 s10, s26, 16 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_lshl_b32 s13, s29, 16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s27, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_and_b32 s15, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s10, 16 +; SI-NEXT: s_and_b32 s41, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s9, 16 +; SI-NEXT: s_and_b32 s43, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s7, 16 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s5, 16 +; SI-NEXT: s_and_b32 s13, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s4, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_3 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB93_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: s_branch .LBB93_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v31 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB47_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v56 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v0, v30, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v36, v2, 16 -; GCN-NEXT: v_alignbit_b32 v8, v37, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v38, v5, 16 -; GCN-NEXT: v_alignbit_b32 v16, v39, v33, 16 -; GCN-NEXT: v_alignbit_b32 v20, v48, v9, 16 -; GCN-NEXT: v_alignbit_b32 v24, v49, v10, 16 -; GCN-NEXT: v_alignbit_b32 v28, v50, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v31, v14, 16 -; GCN-NEXT: v_alignbit_b32 v26, v27, v17, 16 -; GCN-NEXT: v_alignbit_b32 v22, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v18, v19, v21, 16 -; GCN-NEXT: v_alignbit_b32 v14, v15, v34, 16 -; GCN-NEXT: v_alignbit_b32 v10, v11, v25, 16 -; GCN-NEXT: v_alignbit_b32 v6, v7, v35, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v29, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v41, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v40, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v55, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v53, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v52, 16 -; GCN-NEXT: v_alignbit_b32 v29, v30, v51, 16 -; GCN-NEXT: .LBB47_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB94_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB94_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v32i16: ; VI: ; %bb.0: @@ -30855,7 +62198,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -31146,7 +62489,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -31157,7 +62500,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -31401,7 +62744,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -31413,7 +62756,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 @@ -31709,7 +63052,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v18, 16, v20 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v19 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -31721,7 +63064,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 @@ -31971,7 +63314,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v29, v32, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v27, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -31991,810 +63334,2086 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB95_2 +; +; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_lshl_b32 s5, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s5, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s5, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s5, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s4, v1 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_branch .LBB95_5 +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB95_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s5, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s5, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_and_or_b32 v14, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_and_or_b32 v15, v3, v16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_or_b32 v1, v1, v16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v17, v16, v0 +; GFX9-NEXT: s_branch .LBB95_5 +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB95_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s12, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s13, 16 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: s_and_b32 s2, s14, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: s_lshl_b32 s1, s27, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v3 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7 +; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v5, v6, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v4 +; GFX11-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v10 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v18, v6, v8 :: v_dual_add_nc_u32 v7, v9, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v7, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v6 +; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v12 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v7, v8, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v8, v10 :: v_dual_add_nc_u32 v8, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v7 +; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v13 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v10, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v9 +; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v13, v14, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v10, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v12 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v24, v11, v12 :: v_dual_add_nc_u32 v9, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-NEXT: v_bfe_u32 v13, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v23 +; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v13 +; GFX11-NEXT: v_bfe_u32 v13, v25, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v12, v14, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v12, v15, v11 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_bfe_u32 v28, v14, 16, 1 +; GFX11-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v10, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v25 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v27, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v12, v28, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v14 +; GFX11-NEXT: v_bfe_u32 v29, v15, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-NEXT: v_bfe_u32 v13, v27, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-NEXT: v_dual_cndmask_b32 v25, v12, v28 :: v_dual_add_nc_u32 v12, v29, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v27 +; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v15 +; GFX11-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v29 :: v_dual_add_nc_u32 v15, v30, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v14 +; GFX11-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v13, v31, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v13, v32, v28 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v14, v15, v30 :: v_dual_add_nc_u32 v13, 0x7fff, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v32, v29 +; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v28, v33, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v30 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v39, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v37, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v15, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v38, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v27, v31 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v25, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v28, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v26, v23 +; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v24, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v5, v27 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v21, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v29 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v10, v33 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v19, v20 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v18, v21 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v17, v22 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v16, v23 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v24 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB95_3: +; GFX11-NEXT: s_branch .LBB95_2 +; GFX11-NEXT: .LBB95_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v4 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v8 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v12 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v16 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v49 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_bfe_u32 v4, v4, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v8, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_bfe_u32 v44, v12, 8, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v16, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v20, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v24, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v28, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v49, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v52, v1, v46 -; GCN-NEXT: v_or_b32_e32 v51, v2, v47 -; GCN-NEXT: v_or_b32_e32 v48, v3, v57 -; GCN-NEXT: v_or_b32_e32 v37, v5, v59 -; GCN-NEXT: v_or_b32_e32 v32, v6, v60 -; GCN-NEXT: v_or_b32_e32 v6, v7, v61 -; GCN-NEXT: v_or_b32_e32 v58, v9, v62 -; GCN-NEXT: v_or_b32_e32 v56, v11, v63 -; GCN-NEXT: v_or_b32_e32 v42, v13, v10 -; GCN-NEXT: v_or_b32_e32 v54, v15, v14 -; GCN-NEXT: v_or_b32_e32 v50, v17, v22 -; GCN-NEXT: v_or_b32_e32 v39, v18, v33 -; GCN-NEXT: v_or_b32_e32 v36, v19, v26 -; GCN-NEXT: v_or_b32_e32 v31, v21, v34 -; GCN-NEXT: v_or_b32_e32 v18, v23, v30 -; GCN-NEXT: v_or_b32_e32 v2, v25, v35 -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v55, v6, v32, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v6, v32, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v6, v32, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v41, v56, v58, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v58, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v56, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v45, v39, v50, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: .LBB48_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v49, v40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v2, v30, v2 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v6, v26, v6 -; GCN-NEXT: v_or_b32_e32 v8, v34, v8 -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: v_or_b32_e32 v16, v33, v16 -; GCN-NEXT: v_or_b32_e32 v10, v10, v17 -; GCN-NEXT: v_or_b32_e32 v14, v14, v18 -; GCN-NEXT: v_or_b32_e32 v13, v62, v13 -; GCN-NEXT: v_or_b32_e32 v15, v63, v15 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: v_or_b32_e32 v11, v61, v11 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v3 -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v4, v37, v48, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v8, v37, v48, 8 -; GCN-NEXT: v_alignbit_b32 v55, v6, v32, 24 -; GCN-NEXT: v_alignbit_b32 v20, v6, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v6, v32, 8 -; GCN-NEXT: v_alignbit_b32 v41, v56, v58, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v58, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v56, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v45, v39, v50, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v24, 24, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: .LBB48_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v5, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v12 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v1 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v58 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: v_or_b32_e32 v14, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v24 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v56 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-NEXT: v_or_b32_e32 v16, v4, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v55 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v42 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_or_b32_e32 v17, v6, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v44 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v54 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GCN-NEXT: v_or_b32_e32 v21, v9, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v41 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v45 -; GCN-NEXT: v_or_b32_e32 v24, v11, v15 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v39 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_or_b32_e32 v22, v15, v19 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_or_b32_e32 v36, v19, v25 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v31 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v31, v25, v31 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v50 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v50 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v50 -; GCN-NEXT: v_or_b32_e32 v50, v18, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 24, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v54 -; GCN-NEXT: v_or_b32_e32 v54, v2, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v33, v33, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v43, v27, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_or_b32_e32 v29, v29, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v20, v20, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v23, v23, v38 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v35, v37, v51 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v37, v39, v55 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v38, v49, v42 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v39, v52, v45 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v48, v53, v46 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v49, v40, v47 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v51, v41, v56 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v52, v44, v57 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v54, v60, v58 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v29 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v23 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_or_b32_e32 v16, v16, v35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v37 -; GCN-NEXT: v_or_b32_e32 v20, v21, v38 -; GCN-NEXT: v_or_b32_e32 v21, v24, v39 -; GCN-NEXT: v_or_b32_e32 v22, v22, v48 -; GCN-NEXT: v_or_b32_e32 v23, v36, v49 -; GCN-NEXT: v_or_b32_e32 v24, v31, v51 -; GCN-NEXT: v_or_b32_e32 v29, v50, v52 -; GCN-NEXT: v_or_b32_e32 v30, v53, v54 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v30 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v56, v1, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v47, v1, v27 +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v50, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v38, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v34, v1, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v33, v1, v39 +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v32, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v31, v1, v49 +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v30, v1, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v26, v1, v52 +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v22, v1, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v18, v1, v54 +; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v14, v1, v41 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 +; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 +; SI-NEXT: v_bfe_u32 v62, v44, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v10, v1, v40 +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v6, v1, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v2, v1, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: .LBB96_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v55, v4 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v47, v56, 24 +; SI-NEXT: v_alignbit_b32 v24, v31, v32, 24 +; SI-NEXT: v_alignbit_b32 v28, v31, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v26, v30, 24 +; SI-NEXT: v_alignbit_b32 v16, v26, v30, 16 +; SI-NEXT: v_alignbit_b32 v44, v26, v30, 8 +; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 +; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v18, v22, 8 +; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: .LBB96_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v64i8: ; VI: ; %bb.0: @@ -32901,7 +65520,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -32979,9 +65598,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v18, 3 ; VI-NEXT: v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -33099,7 +65718,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v18 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 @@ -33337,7 +65956,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -33390,9 +66009,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] @@ -33460,7 +66079,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -33620,7 +66239,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -33654,9 +66273,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] @@ -33706,7 +66325,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %end +; GFX11-TRUE16-NEXT: .LBB96_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -33918,7 +66537,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -33968,9 +66587,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] @@ -34036,7 +66655,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -34206,780 +66825,3032 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v37, s30, 0 +; SI-NEXT: v_writelane_b32 v37, s31, 1 +; SI-NEXT: v_writelane_b32 v37, s34, 2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v37, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s34, v18 +; SI-NEXT: v_readfirstlane_b32 s35, v17 +; SI-NEXT: v_readfirstlane_b32 s30, v14 +; SI-NEXT: v_readfirstlane_b32 s31, v13 +; SI-NEXT: v_readfirstlane_b32 s94, v10 +; SI-NEXT: v_readfirstlane_b32 s95, v9 +; SI-NEXT: v_readfirstlane_b32 s92, v6 +; SI-NEXT: v_readfirstlane_b32 s93, v5 +; SI-NEXT: v_readfirstlane_b32 s90, v2 +; SI-NEXT: v_readfirstlane_b32 s91, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_alignbit_b32 v18, s41, v1, 24 +; SI-NEXT: v_alignbit_b32 v25, s41, v1, 16 +; SI-NEXT: v_alignbit_b32 v30, s41, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_alignbit_b32 v19, s15, v1, 24 +; SI-NEXT: v_alignbit_b32 v26, s15, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, s15, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: v_alignbit_b32 v17, s13, v1, 24 +; SI-NEXT: v_alignbit_b32 v23, s13, v1, 16 +; SI-NEXT: v_alignbit_b32 v29, s13, v1, 8 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_alignbit_b32 v16, s11, v1, 24 +; SI-NEXT: v_alignbit_b32 v20, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v27, s11, v1, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: v_or_b32_e32 v5, v1, v33 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: s_and_b32 s4, s95, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: v_or_b32_e32 v4, v1, v34 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: s_and_b32 s4, s31, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: v_or_b32_e32 v2, v1, v35 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: v_alignbit_b32 v9, s9, v5, 24 +; SI-NEXT: v_alignbit_b32 v12, s9, v5, 16 +; SI-NEXT: v_alignbit_b32 v21, s9, v5, 8 +; SI-NEXT: v_alignbit_b32 v6, s8, v4, 24 +; SI-NEXT: v_alignbit_b32 v8, s8, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, s8, v4, 8 +; SI-NEXT: v_alignbit_b32 v24, s7, v2, 24 +; SI-NEXT: v_alignbit_b32 v28, s7, v2, 16 +; SI-NEXT: v_alignbit_b32 v32, s7, v2, 8 +; SI-NEXT: v_alignbit_b32 v10, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v14, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v22, s6, v1, 8 +; SI-NEXT: s_lshr_b32 s78, s41, 8 +; SI-NEXT: s_lshr_b32 s75, s15, 8 +; SI-NEXT: s_lshr_b32 s72, s13, 8 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s58, s9, 8 +; SI-NEXT: s_lshr_b32 s47, s8, 8 +; SI-NEXT: s_lshr_b32 s45, s7, 8 +; SI-NEXT: s_lshr_b32 s42, s6, 8 +; SI-NEXT: s_and_b32 s88, s19, 0xffff +; SI-NEXT: s_and_b32 s77, s23, 0xffff +; SI-NEXT: s_and_b32 s74, s27, 0xffff +; SI-NEXT: s_and_b32 s63, s90, 0xffff +; SI-NEXT: s_and_b32 s60, s92, 0xffff +; SI-NEXT: s_and_b32 s57, s94, 0xffff +; SI-NEXT: s_and_b32 s46, s30, 0xffff +; SI-NEXT: s_and_b32 s43, s34, 0xffff +; SI-NEXT: s_bfe_u32 s89, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s79, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s76, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s73, s90, 0x80008 +; SI-NEXT: s_bfe_u32 s62, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s59, s94, 0x80008 +; SI-NEXT: s_bfe_u32 s56, s30, 0x80008 +; SI-NEXT: s_bfe_u32 s44, s34, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s31, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s95, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_alignbit_b32 v18, s41, v6, 24 +; SI-NEXT: v_alignbit_b32 v25, s41, v6, 16 +; SI-NEXT: v_alignbit_b32 v30, s41, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_alignbit_b32 v19, s15, v6, 24 +; SI-NEXT: v_alignbit_b32 v26, s15, v6, 16 +; SI-NEXT: v_alignbit_b32 v31, s15, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_mov_b32_e32 v10, s7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v3 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_alignbit_b32 v17, s13, v6, 24 +; SI-NEXT: v_alignbit_b32 v23, s13, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, s13, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_alignbit_b32 v16, s11, v6, 24 +; SI-NEXT: v_alignbit_b32 v20, s11, v6, 16 +; SI-NEXT: v_alignbit_b32 v27, s11, v6, 8 +; SI-NEXT: v_alignbit_b32 v9, v3, v5, 24 +; SI-NEXT: v_alignbit_b32 v12, v3, v5, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v5, 8 +; SI-NEXT: v_alignbit_b32 v6, v7, v4, 24 +; SI-NEXT: v_alignbit_b32 v8, v7, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v4, 8 +; SI-NEXT: v_alignbit_b32 v24, v10, v2, 24 +; SI-NEXT: v_alignbit_b32 v28, v10, v2, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v2, 8 +; SI-NEXT: v_alignbit_b32 v10, v15, v1, 24 +; SI-NEXT: v_alignbit_b32 v14, v15, v1, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v1, 8 +; SI-NEXT: s_lshr_b32 s89, s41, 24 +; SI-NEXT: s_lshr_b32 s88, s41, 16 +; SI-NEXT: s_lshr_b32 s78, s41, 8 +; SI-NEXT: s_lshr_b32 s79, s15, 24 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s75, s15, 8 +; SI-NEXT: s_lshr_b32 s76, s13, 24 +; SI-NEXT: s_lshr_b32 s74, s13, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 8 +; SI-NEXT: s_lshr_b32 s73, s11, 24 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s62, s9, 24 +; SI-NEXT: s_lshr_b32 s60, s9, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 8 +; SI-NEXT: s_lshr_b32 s56, s7, 24 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 8 +; SI-NEXT: s_lshr_b32 s44, s6, 24 +; SI-NEXT: s_lshr_b32 s43, s6, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 8 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v30 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s89, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s79, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v17 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s76, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s73, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s62, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s56, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s44, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s35, v37, 3 +; SI-NEXT: v_readlane_b32 s34, v37, 2 +; SI-NEXT: v_readlane_b32 s31, v37, 1 +; SI-NEXT: v_readlane_b32 s30, v37, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_branch .LBB97_2 +; +; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s28, 3 +; VI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s29, 3 +; VI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s26, 3 +; VI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s27, 3 +; VI-NEXT: s_and_b32 s26, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s27, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s28, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 s29, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s40, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s41, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 s42, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s43, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s44, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s45, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s17, s45, s17 +; VI-NEXT: s_or_b32 s16, s44, s16 +; VI-NEXT: s_or_b32 s19, s43, s19 +; VI-NEXT: s_or_b32 s18, s42, s18 +; VI-NEXT: s_or_b32 s21, s41, s21 +; VI-NEXT: s_or_b32 s20, s40, s20 +; VI-NEXT: s_or_b32 s23, s29, s23 +; VI-NEXT: s_or_b32 s22, s28, s22 +; VI-NEXT: s_or_b32 s25, s27, s25 +; VI-NEXT: s_or_b32 s24, s26, s24 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s5, s7, s5 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s27, s14, 0x30000 +; VI-NEXT: s_add_i32 s26, s12, 0x30000 +; VI-NEXT: s_add_i32 s29, s10, 0x30000 +; VI-NEXT: s_add_i32 s28, s8, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s66, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s65, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s55, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s51, 0xff +; VI-NEXT: s_lshl_b32 s11, s50, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s49, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s48, 0xff +; VI-NEXT: s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s39, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s38, 0xff +; VI-NEXT: s_lshl_b32 s11, s37, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s35, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s30, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s91, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s90, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s11, s79, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s78, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s77, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_lshl_b32 s10, s74, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s72, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s61, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s59, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s59, s5, 8 +; GFX9-NEXT: s_lshr_b32 s58, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s72, s29, 8 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s27, 8 +; GFX9-NEXT: s_lshr_b32 s76, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 8 +; GFX9-NEXT: s_lshr_b32 s89, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s95, s23, 8 +; GFX9-NEXT: s_lshr_b32 s94, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s36, s21, 8 +; GFX9-NEXT: s_lshr_b32 s35, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s49, s19, 8 +; GFX9-NEXT: s_lshr_b32 s48, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s54, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v10, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v12, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v17, s55 +; GFX9-NEXT: v_mov_b32_e32 v62, s53 +; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v60, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v57, s49 +; GFX9-NEXT: v_mov_b32_e32 v47, s39 +; GFX9-NEXT: v_mov_b32_e32 v56, s38 +; GFX9-NEXT: v_mov_b32_e32 v46, s37 +; GFX9-NEXT: v_mov_b32_e32 v45, s35 +; GFX9-NEXT: v_mov_b32_e32 v44, s36 +; GFX9-NEXT: v_mov_b32_e32 v42, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s31 +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v55, s95 +; GFX9-NEXT: v_mov_b32_e32 v53, s93 +; GFX9-NEXT: v_mov_b32_e32 v54, s92 +; GFX9-NEXT: v_mov_b32_e32 v52, s91 +; GFX9-NEXT: v_mov_b32_e32 v51, s89 +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: v_mov_b32_e32 v48, s88 +; GFX9-NEXT: v_mov_b32_e32 v49, s79 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v38, s76 +; GFX9-NEXT: v_mov_b32_e32 v37, s77 +; GFX9-NEXT: v_mov_b32_e32 v35, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s74 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s63 +; GFX9-NEXT: v_mov_b32_e32 v32, s72 +; GFX9-NEXT: v_mov_b32_e32 v30, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s61 +; GFX9-NEXT: v_mov_b32_e32 v29, s60 +; GFX9-NEXT: v_mov_b32_e32 v28, s58 +; GFX9-NEXT: v_mov_b32_e32 v27, s59 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v18, s56 +; GFX9-NEXT: v_mov_b32_e32 v23, s12 +; GFX9-NEXT: v_mov_b32_e32 v24, s10 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32i16_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s77 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s75 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s4 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v87, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xff, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v85, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v82, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v69, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v21, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v22, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v17, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v23, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v21, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v23, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v38, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v6 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32i16_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v23 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v6, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v4, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v2, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s49 :: v_dual_mov_b32 v87, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s48 :: v_dual_mov_b32 v85, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s37 :: v_dual_mov_b32 v83, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s34 :: v_dual_mov_b32 v81, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s31 :: v_dual_mov_b32 v71, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, vcc_hi :: v_dual_mov_b32 v69, s94 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s95 :: v_dual_mov_b32 v67, s93 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s92 :: v_dual_mov_b32 v65, s91 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s89 :: v_dual_mov_b32 v55, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s88 :: v_dual_mov_b32 v53, s79 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s78 :: v_dual_mov_b32 v51, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s77 :: v_dual_mov_b32 v49, s75 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s74 :: v_dual_mov_b32 v39, s73 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s63 :: v_dual_mov_b32 v37, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s62 :: v_dual_mov_b32 v35, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s60 :: v_dual_mov_b32 v33, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s59 :: v_dual_mov_b32 v31, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s56 :: v_dual_mov_b32 v29, s47 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v7, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s46 :: v_dual_mov_b32 v11, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s44 :: v_dual_mov_b32 v17, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v21, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s10 :: v_dual_mov_b32 v26, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_mov_b32 v28, s4 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v87, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v23, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v85, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v87, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v69, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, v24, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, v20, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v15, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v28, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v14, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v5, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v8 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[82:85], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v21 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v19 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v27 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v33 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v57, 8, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v20 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v47 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v58 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v59 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v60 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v21, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v23 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; GCN-NEXT: v_or_b32_e32 v23, v28, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v33, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v34, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v35, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_or_b32_e32 v15, v15, v44 -; GCN-NEXT: v_or_b32_e32 v16, v16, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v32, v11 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v11, v9 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v11, v6 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v11, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v7, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v11, v18 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v11, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v19, v21 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v19, v22 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v25 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v23, v26 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v27 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v33, v1, v10 -; GCN-NEXT: v_or_b32_e32 v35, v3, v8 -; GCN-NEXT: v_or_b32_e32 v37, v4, v6 -; GCN-NEXT: v_or_b32_e32 v39, v5, v0 -; GCN-NEXT: v_or_b32_e32 v49, v7, v17 -; GCN-NEXT: v_or_b32_e32 v51, v11, v20 -; GCN-NEXT: v_or_b32_e32 v53, v19, v22 -; GCN-NEXT: v_or_b32_e32 v55, v23, v24 -; GCN-NEXT: v_or_b32_e32 v32, v27, v40 -; GCN-NEXT: v_or_b32_e32 v34, v28, v9 -; GCN-NEXT: v_or_b32_e32 v36, v29, v41 -; GCN-NEXT: v_or_b32_e32 v38, v12, v2 -; GCN-NEXT: v_or_b32_e32 v48, v13, v18 -; GCN-NEXT: v_or_b32_e32 v50, v14, v21 -; GCN-NEXT: v_or_b32_e32 v52, v15, v25 -; GCN-NEXT: v_or_b32_e32 v54, v16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_alignbit_b32 v1, v33, v40, 16 -; GCN-NEXT: v_alignbit_b32 v5, v35, v9, 16 -; GCN-NEXT: v_alignbit_b32 v9, v37, v41, 16 -; GCN-NEXT: v_alignbit_b32 v13, v39, v2, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v18, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v21, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v25, 16 -; GCN-NEXT: v_alignbit_b32 v29, v55, v26, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v45, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v57, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v41 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v58 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v46 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v30 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v27, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s7, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v1, v31, v1 -; GCN-NEXT: v_or_b32_e32 v3, v8, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v9 -; GCN-NEXT: v_or_b32_e32 v6, v15, v13 -; GCN-NEXT: v_or_b32_e32 v7, v17, v16 -; GCN-NEXT: v_or_b32_e32 v8, v21, v19 -; GCN-NEXT: v_or_b32_e32 v9, v24, v23 -; GCN-NEXT: v_or_b32_e32 v11, v18, v25 -; GCN-NEXT: v_or_b32_e32 v13, v22, v20 -; GCN-NEXT: v_or_b32_e32 v10, v10, v26 -; GCN-NEXT: v_or_b32_e32 v12, v14, v12 -; GCN-NEXT: v_or_b32_e32 v14, v28, v27 -; GCN-NEXT: v_or_b32_e32 v15, v30, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v15 -; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16 -; GCN-NEXT: v_alignbit_b32 v5, v35, v34, 16 -; GCN-NEXT: v_alignbit_b32 v9, v37, v36, 16 -; GCN-NEXT: v_alignbit_b32 v13, v39, v38, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v32 -; GCN-NEXT: v_mov_b32_e32 v2, v33 -; GCN-NEXT: v_mov_b32_e32 v4, v34 -; GCN-NEXT: v_mov_b32_e32 v6, v35 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mov_b32_e32 v8, v36 -; GCN-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NEXT: v_mov_b32_e32 v12, v38 -; GCN-NEXT: v_mov_b32_e32 v14, v39 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v16, v48 -; GCN-NEXT: v_mov_b32_e32 v18, v49 -; GCN-NEXT: v_mov_b32_e32 v20, v50 -; GCN-NEXT: v_mov_b32_e32 v22, v51 -; GCN-NEXT: v_mov_b32_e32 v24, v52 -; GCN-NEXT: v_mov_b32_e32 v26, v53 -; GCN-NEXT: v_mov_b32_e32 v28, v54 -; GCN-NEXT: v_mov_b32_e32 v30, v55 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v17 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v48 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v49 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v38 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v11, v9, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v7, v9 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v7, v1 +; SI-NEXT: v_or_b32_e32 v49, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v48, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v15, v13, v9 +; SI-NEXT: v_alignbit_b32 v9, v49, v15, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v19, v13, v9 +; SI-NEXT: v_or_b32_e32 v50, v8, v19 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v50, v0, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v51, v6, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v22, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v22, v22, v60 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v52, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v51, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, v52, v14, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_or_b32_e32 v53, v18, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v58, v23 +; SI-NEXT: v_or_b32_e32 v55, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v44 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v54, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v56, v18 +; SI-NEXT: v_alignbit_b32 v21, v53, v2, 16 +; SI-NEXT: v_alignbit_b32 v25, v54, v18, 16 +; SI-NEXT: v_alignbit_b32 v29, v55, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v37, v12, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v35, v11, v15 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v33, v11, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v32, v0, v6 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v34, v0, v14 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v36, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v38, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v39, v0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_alignbit_b32 v29, v55, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v25, v54, v38, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v53, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_alignbit_b32 v21, v53, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v52, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v17, v52, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v51, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v50, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v50, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v49, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v48, v37, 16 +; SI-NEXT: v_alignbit_b32 v0, v49, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v37 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: v_mov_b32_e32 v4, v35 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v51 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v18, v52 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v38 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v32i16: ; VI: ; %bb.0: @@ -35093,7 +69964,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -35256,9 +70127,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: .LBB49_2: ; %Flow +; VI-NEXT: .LBB98_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_4 +; VI-NEXT: s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v38 @@ -35413,7 +70284,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: .LBB49_4: ; %end +; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -35557,7 +70428,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -35722,9 +70593,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: .LBB49_2: ; %Flow +; GFX9-NEXT: .LBB98_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_4 +; GFX9-NEXT: s_cbranch_execz .LBB98_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 @@ -35877,7 +70748,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 -; GFX9-NEXT: .LBB49_4: ; %end +; GFX9-NEXT: .LBB98_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -36008,15 +70879,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.l @@ -36147,8 +71018,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -36378,15 +71249,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 @@ -36533,8 +71404,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 @@ -36704,315 +71575,2427 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s15, v27 +; SI-NEXT: v_readfirstlane_b32 s40, v26 +; SI-NEXT: v_readfirstlane_b32 s12, v19 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s44, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v32 +; SI-NEXT: v_readfirstlane_b32 s42, v33 +; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v37 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v48 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v59 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v62 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s14, s19, 24 +; SI-NEXT: s_or_b32 s4, s14, s4 +; SI-NEXT: s_and_b32 s14, s28, 0xff +; SI-NEXT: s_lshl_b32 s46, s29, 8 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s46, s6, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_lshl_b32 s47, s7, 24 +; SI-NEXT: s_or_b32 s57, s47, s46 +; SI-NEXT: s_and_b32 s46, s26, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_lshl_b32 s47, s27, 24 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s16, 0xff +; SI-NEXT: s_lshl_b32 s56, s17, 8 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s47, s47, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s56, s25, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v2, v10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s46 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: s_or_b32 s46, s4, s46 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s56, s8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v15, v3, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v19, v7, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v36, v13, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v35, s4, v15 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s56, s10, 8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v17, v17, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v23, v51, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v27, v52, v18 +; SI-NEXT: v_or_b32_e32 v62, v47, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v41 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v18, v17, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v37, s4, v23 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s56, s12, 8 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v21, v21, v43 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v33, v58, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xff, v45 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v25, v54, v17 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v31, v42, v26 +; SI-NEXT: v_or_b32_e32 v32, v32, v60 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v38, v21, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v59, v34 +; SI-NEXT: v_or_b32_e32 v39, s4, v25 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s56, s15, 8 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v48, v32, v63 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v56 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v29, v44, v21 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v26, v26, v57 +; SI-NEXT: v_or_b32_e32 v34, v61, v32 +; SI-NEXT: v_or_b32_e32 v32, s4, v29 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s56, s42, 8 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v17, v18, v25, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 +; SI-NEXT: v_or_b32_e32 v33, s4, v33 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s56, s44, 8 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: s_or_b32 s14, s14, s57 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s41, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 +; SI-NEXT: v_alignbit_b32 v13, v36, v23, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 +; SI-NEXT: v_or_b32_e32 v34, s4, v34 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v61, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v45 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v37, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: s_add_i32 s47, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s41, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_alignbit_b32 v1, s41, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s46 +; SI-NEXT: v_alignbit_b32 v5, s14, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v35, 16 +; SI-NEXT: v_alignbit_b32 v13, v36, v37, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v39, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 +; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 +; SI-NEXT: s_lshr_b32 s56, s41, 16 +; SI-NEXT: s_lshr_b32 s57, s14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v48 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_mov_b32_e32 v2, s41 +; SI-NEXT: v_mov_b32_e32 v3, s56 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v12, v37 +; SI-NEXT: v_mov_b32_e32 v14, v36 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v20, v32 +; SI-NEXT: v_mov_b32_e32 v22, v38 +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v64i8_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v19, v57 +; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_mov_b32_e32 v42, v17 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v41, v21 +; VI-NEXT: v_mov_b32_e32 v38, v20 +; VI-NEXT: v_mov_b32_e32 v39, v25 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v23, v50 +; VI-NEXT: v_mov_b32_e32 v50, v30 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v59, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v58 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v19 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v50 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v32 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v43 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v38 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v41 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v31 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v39 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v52 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v61 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v48 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v51 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v38, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v53 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v19 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v59 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_or_b32_sdwa v48, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v37, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v19, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v30 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v26, v19, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v23, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v23 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_or_b32_e32 v6, v6, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; VI-NEXT: v_or_b32_e32 v8, v8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; VI-NEXT: v_or_b32_e32 v9, v9, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; VI-NEXT: v_or_b32_e32 v10, v10, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; VI-NEXT: v_or_b32_e32 v11, v11, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 +; VI-NEXT: v_or_b32_sdwa v7, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v23, v50 +; VI-NEXT: v_mov_b32_e32 v19, v57 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v36, v13 +; VI-NEXT: v_mov_b32_e32 v42, v17 +; VI-NEXT: v_mov_b32_e32 v41, v21 +; VI-NEXT: v_mov_b32_e32 v39, v25 +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v38, v20 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v50, v30 +; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v18, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v96, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v86, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v96, 16, v97 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB99_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB99_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB99_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB99_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB99_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v31 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB50_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v55 -; GCN-NEXT: .LBB50_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v25 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v26 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB100_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: .LBB100_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32bf16: ; VI: ; %bb.0: @@ -37021,7 +74004,7 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 @@ -37072,68 +74055,535 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB50_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB100_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v32bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB100_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB100_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f16_to_v32bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB100_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB100_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + +define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v32f16_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB101_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_3 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v19, v15 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v19, v14 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v19, v13 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v19, v12 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v19, v11 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v19, v10 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v19, v9 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v19, v7 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v19, v6 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v19, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v19, v4 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_or_b32_e32 v2, v19, v2 +; VI-NEXT: v_or_b32_e32 v1, v18, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB101_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_4: +; VI-NEXT: s_branch .LBB101_2 ; -; GFX9-LABEL: bitcast_v32f16_to_v32bf16: +; GFX9-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB50_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB50_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_3 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB101_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: s_branch .LBB101_2 ; -; GFX11-LABEL: bitcast_v32f16_to_v32bf16: +; GFX11-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB50_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -37153,346 +74603,347 @@ end: } define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v63 -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v31 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB51_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v56 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: .LBB51_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB102_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB102_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v32f16: ; VI: ; %bb.0: @@ -37501,7 +74952,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -37792,7 +75243,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -37803,7 +75254,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -38047,7 +75498,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -38059,7 +75510,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 @@ -38330,7 +75781,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 ; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v29 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -38342,7 +75793,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 @@ -38592,7 +76043,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v29, v32, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v27, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -38612,761 +76063,2489 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s29 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB103_2 +; +; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_lshl_b32 s5, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s5, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s5, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s5, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s4, v1 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_branch .LBB103_5 +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB103_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s5, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s5, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB103_5 +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB103_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s12, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s13, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s14, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v5, v8, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v13, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v14, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v26, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v27, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v26, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v30, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v34, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v33 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v26, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v30, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v31, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v32, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v23, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v18, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v17, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v16, 16, v23 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB103_3: +; GFX11-TRUE16-NEXT: s_branch .LBB103_2 +; GFX11-TRUE16-NEXT: .LBB103_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s12, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s13, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s14, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v3, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v14, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v27, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v26, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v30, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v34, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v33 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v30, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v26, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v32, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v5, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v24, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v23, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v21, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v18, 16, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v17, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v16, 16, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v23 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB103_3: +; GFX11-FAKE16-NEXT: s_branch .LBB103_2 +; GFX11-FAKE16-NEXT: .LBB103_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v39 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v1 -; GCN-NEXT: v_bfe_u32 v33, v31, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v8, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v6, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v5, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v4, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v3, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v2, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v57, v34, v10 -; GCN-NEXT: v_or_b32_e32 v56, v32, v11 -; GCN-NEXT: v_or_b32_e32 v47, v35, v12 -; GCN-NEXT: v_or_b32_e32 v45, v7, v14 -; GCN-NEXT: v_or_b32_e32 v41, v36, v20 -; GCN-NEXT: v_or_b32_e32 v53, v9, v23 -; GCN-NEXT: v_or_b32_e32 v48, v37, v25 -; GCN-NEXT: v_or_b32_e32 v39, v13, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v16, v29 -; GCN-NEXT: v_or_b32_e32 v30, v18, v30 -; GCN-NEXT: v_or_b32_e32 v25, v15, v38 -; GCN-NEXT: v_or_b32_e32 v23, v22, v49 -; GCN-NEXT: v_or_b32_e32 v20, v17, v24 -; GCN-NEXT: v_or_b32_e32 v14, v27, v50 -; GCN-NEXT: v_or_b32_e32 v12, v21, v28 -; GCN-NEXT: v_or_b32_e32 v10, v19, v51 -; GCN-NEXT: v_alignbit_b32 v42, v56, v57, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v57, 16 -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: v_alignbit_b32 v55, v45, v47, 24 -; GCN-NEXT: v_alignbit_b32 v40, v45, v47, 16 -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v46, v39, v48, 8 -; GCN-NEXT: v_alignbit_b32 v29, v30, v33, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v30, v33, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v26, v30, v33, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v11, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v23 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: .LBB52_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; GCN-NEXT: v_bfe_u32 v12, v31, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v8, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v6, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v5, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v3, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v2, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v33, v30 -; GCN-NEXT: v_or_b32_e32 v20, v14, v34 -; GCN-NEXT: v_or_b32_e32 v14, v17, v35 -; GCN-NEXT: v_or_b32_e32 v25, v15, v19 -; GCN-NEXT: v_or_b32_e32 v23, v21, v36 -; GCN-NEXT: v_or_b32_e32 v33, v16, v37 -; GCN-NEXT: v_or_b32_e32 v30, v18, v38 -; GCN-NEXT: v_or_b32_e32 v48, v24, v22 -; GCN-NEXT: v_or_b32_e32 v39, v13, v39 -; GCN-NEXT: v_or_b32_e32 v41, v26, v49 -; GCN-NEXT: v_or_b32_e32 v53, v9, v50 -; GCN-NEXT: v_or_b32_e32 v47, v27, v51 -; GCN-NEXT: v_or_b32_e32 v45, v7, v52 -; GCN-NEXT: v_or_b32_e32 v57, v29, v28 -; GCN-NEXT: v_or_b32_e32 v56, v32, v54 -; GCN-NEXT: v_alignbit_b32 v42, v56, v57, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v57, 16 -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: v_alignbit_b32 v55, v45, v47, 24 -; GCN-NEXT: v_alignbit_b32 v40, v45, v47, 16 -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v46, v39, v48, 8 -; GCN-NEXT: v_alignbit_b32 v29, v30, v33, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v30, v33, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v26, v30, v33, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v11, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v23 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: .LBB52_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v61 -; GCN-NEXT: v_or_b32_e32 v13, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; GCN-NEXT: v_or_b32_e32 v15, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v60 -; GCN-NEXT: v_or_b32_e32 v16, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v63 -; GCN-NEXT: v_or_b32_e32 v17, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v59 -; GCN-NEXT: v_or_b32_e32 v19, v7, v9 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v42 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v31 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v62 -; GCN-NEXT: v_or_b32_e32 v21, v7, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v55 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v46 -; GCN-NEXT: v_or_b32_e32 v22, v9, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v58 -; GCN-NEXT: v_or_b32_e32 v24, v8, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v26 -; GCN-NEXT: v_or_b32_e32 v26, v11, v18 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v44 -; GCN-NEXT: v_or_b32_e32 v28, v6, v18 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v25 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_or_b32_e32 v25, v18, v25 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_or_b32_e32 v23, v5, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v29 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v20, v29 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-NEXT: v_or_b32_e32 v14, v4, v14 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; GCN-NEXT: v_or_b32_e32 v53, v12, v53 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GCN-NEXT: v_or_b32_e32 v41, v3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_or_b32_e32 v37, v38, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_or_b32_e32 v34, v34, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v36, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v27, v27, v39 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v33, v33, v54 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v39, v50, v42 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v49, v52, v44 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v50, v55, v46 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v51, v40, v47 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v52, v43, v56 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v38, v38, v57 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v54, v45, v58 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v41 -; GCN-NEXT: v_or_b32_e32 v36, v60, v36 -; GCN-NEXT: v_or_b32_e32 v13, v13, v37 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_or_b32_e32 v16, v16, v35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v21, v21, v31 -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: v_or_b32_e32 v24, v24, v33 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v27, v28, v49 -; GCN-NEXT: v_or_b32_e32 v25, v25, v50 -; GCN-NEXT: v_or_b32_e32 v23, v23, v51 -; GCN-NEXT: v_or_b32_e32 v28, v29, v52 -; GCN-NEXT: v_or_b32_e32 v14, v14, v38 -; GCN-NEXT: v_or_b32_e32 v29, v53, v54 -; GCN-NEXT: v_or_b32_e32 v30, v55, v36 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v54, v33, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; SI-NEXT: v_or_b32_e32 v50, v32, v7 +; SI-NEXT: v_alignbit_b32 v7, v50, v54, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v50, v54, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v21, v36, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v20, v35, v7 +; SI-NEXT: v_alignbit_b32 v7, v20, v21, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v20, v21, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v20, v21, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v39, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_or_b32_e32 v19, v38, v7 +; SI-NEXT: v_alignbit_b32 v7, v19, v18, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v19, v18, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v19, v18, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: v_or_b32_e32 v16, v51, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_or_b32_e32 v17, v49, v7 +; SI-NEXT: v_alignbit_b32 v7, v17, v16, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v17, v16, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v40 +; SI-NEXT: v_or_b32_e32 v15, v55, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v53, v7 +; SI-NEXT: v_alignbit_b32 v7, v14, v15, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v14, v15, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v14, v15, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v41, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_or_b32_e32 v13, v22, v7 +; SI-NEXT: v_alignbit_b32 v7, v13, v12, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v13, v12, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v13, v12, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v26, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v25, v7 +; SI-NEXT: v_alignbit_b32 v7, v11, v10, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v11, v10, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v11, v10, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v29, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v22, v1, 8, 8 +; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v13 +; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 +; SI-NEXT: v_bfe_u32 v62, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB104_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v21, v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v54, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v50, v24, v22 +; SI-NEXT: v_alignbit_b32 v22, v50, v54, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v50, v54, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v20, v21, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v20, v21, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v20, v21, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v18, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v18, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v18, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v17, v16, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v17, v16, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v15, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v15, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v15, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v13, v12, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v13, v12, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v13, v12, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v13 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 +; SI-NEXT: v_bfe_u32 v62, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v22, v1, 8, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: .LBB104_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v58 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v61 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v18, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v59 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v47 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v64i8: ; VI: ; %bb.0: @@ -39441,7 +78620,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -39478,9 +78657,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v1 -; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v51, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -39582,7 +78761,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v61, v38, 8, 8 ; VI-NEXT: v_bfe_u32 v54, v49, 8, 8 ; VI-NEXT: v_bfe_u32 v40, v51, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -39792,7 +78971,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -39845,9 +79024,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] @@ -39916,7 +79095,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -40076,7 +79255,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -40110,9 +79289,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB52_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] @@ -40162,7 +79341,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB52_4: ; %end +; GFX11-TRUE16-NEXT: .LBB104_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -40374,7 +79553,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -40424,9 +79603,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB104_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] @@ -40492,7 +79671,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -40662,718 +79841,3160 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v22, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v37, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_or_b32_e32 v32, v9, v8 +; SI-NEXT: v_alignbit_b32 v8, v32, v37, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v32, v37, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v32, v37, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v24, v12, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_or_b32_e32 v23, v11, v8 +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v42, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_or_b32_e32 v19, v14, v8 +; SI-NEXT: v_alignbit_b32 v8, v19, v18, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v19, v18, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v19, v18, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v25, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_or_b32_e32 v17, v28, v8 +; SI-NEXT: v_alignbit_b32 v8, v17, v16, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v17, v16, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v17, v16, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v15, v21, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v62, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v12, v34, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_or_b32_e32 v13, v30, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_or_b32_e32 v10, v50, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v48, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v9, v40, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v9, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v9, 8 +; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 +; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 +; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 +; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v21, v18 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 +; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 +; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 +; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v24, v22, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v37, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_or_b32_e32 v32, v25, v21 +; SI-NEXT: v_alignbit_b32 v21, v32, v37, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v32, v37, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v32, v37, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v23, v24, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v23, v24, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v23, v24, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v19, v18, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v19, v18, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v19, v18, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v8, v9, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v8, v9, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v8, v9, 8 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 +; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v54 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v29 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v32f16_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s75, s5, 24 +; VI-NEXT: s_lshr_b32 s36, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s37, s4, 16 +; VI-NEXT: s_lshr_b32 s56, s4, 8 +; VI-NEXT: s_lshr_b32 s77, s29, 24 +; VI-NEXT: s_lshr_b32 s38, s29, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 8 +; VI-NEXT: s_lshr_b32 s39, s28, 16 +; VI-NEXT: s_lshr_b32 s57, s28, 8 +; VI-NEXT: s_lshr_b32 s79, s27, 24 +; VI-NEXT: s_lshr_b32 s48, s27, 16 +; VI-NEXT: s_lshr_b32 s74, s27, 8 +; VI-NEXT: s_lshr_b32 s49, s26, 16 +; VI-NEXT: s_lshr_b32 s59, s26, 8 +; VI-NEXT: s_lshr_b32 s89, s25, 24 +; VI-NEXT: s_lshr_b32 s50, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s25, 8 +; VI-NEXT: s_lshr_b32 s51, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s24, 8 +; VI-NEXT: s_lshr_b32 s91, s23, 24 +; VI-NEXT: s_lshr_b32 s52, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: s_lshr_b32 s53, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 8 +; VI-NEXT: s_lshr_b32 s31, s21, 24 +; VI-NEXT: s_lshr_b32 s54, s21, 16 +; VI-NEXT: s_lshr_b32 s88, s21, 8 +; VI-NEXT: s_lshr_b32 s55, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 8 +; VI-NEXT: s_lshr_b32 s34, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s90, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 8 +; VI-NEXT: s_lshr_b32 s35, s17, 24 +; VI-NEXT: s_lshr_b32 s66, s17, 16 +; VI-NEXT: s_lshr_b32 s30, s17, 8 +; VI-NEXT: s_lshr_b32 s67, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v12, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: v_add_f16_e32 v27, s17, v1 +; VI-NEXT: v_add_f16_e32 v19, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_or_b32_e32 v10, v27, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; VI-NEXT: v_add_f16_e32 v35, s16, v1 +; VI-NEXT: v_add_f16_e32 v13, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_or_b32_e32 v9, v35, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_add_f16_e32 v28, s19, v1 +; VI-NEXT: v_add_f16_e32 v20, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: v_or_b32_e32 v62, v28, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; VI-NEXT: v_add_f16_e32 v36, s18, v1 +; VI-NEXT: v_add_f16_e32 v14, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s20, 16 +; VI-NEXT: v_or_b32_e32 v61, v36, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_add_f16_e32 v29, s21, v1 +; VI-NEXT: v_add_f16_e32 v21, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v29, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; VI-NEXT: v_add_f16_e32 v37, s20, v1 +; VI-NEXT: v_add_f16_e32 v15, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v37, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_add_f16_e32 v30, s23, v1 +; VI-NEXT: v_add_f16_e32 v22, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: v_or_b32_e32 v47, v30, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; VI-NEXT: v_add_f16_e32 v38, s22, v1 +; VI-NEXT: v_add_f16_e32 v16, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s24, 16 +; VI-NEXT: v_or_b32_e32 v46, v38, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; VI-NEXT: v_add_f16_e32 v31, s25, v1 +; VI-NEXT: v_add_f16_e32 v23, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_or_b32_e32 v6, v31, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; VI-NEXT: v_add_f16_e32 v39, s24, v1 +; VI-NEXT: v_add_f16_e32 v17, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_or_b32_e32 v5, v39, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; VI-NEXT: v_add_f16_e32 v32, s27, v1 +; VI-NEXT: v_add_f16_e32 v24, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_or_b32_e32 v43, v32, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; VI-NEXT: v_add_f16_e32 v48, s26, v1 +; VI-NEXT: v_add_f16_e32 v18, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_or_b32_e32 v42, v48, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_add_f16_e32 v33, s29, v1 +; VI-NEXT: v_add_f16_e32 v25, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: v_or_b32_e32 v55, v33, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; VI-NEXT: v_add_f16_e32 v49, s28, v1 +; VI-NEXT: v_add_f16_e32 v11, s6, v1 +; VI-NEXT: v_add_f16_e32 v34, s5, v1 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_or_b32_e32 v54, v49, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; VI-NEXT: v_add_f16_e32 v26, s5, v1 +; VI-NEXT: v_or_b32_e32 v52, v34, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; VI-NEXT: v_add_f16_e32 v50, s4, v1 +; VI-NEXT: v_or_b32_e32 v51, v50, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[51:52] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[42:43] +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v42 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[46:47] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[61:62] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v55 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v46 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v61 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v10 +; VI-NEXT: v_bfe_u32 v9, v11, 8, 8 +; VI-NEXT: v_bfe_u32 v10, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v40, v17, 8, 8 +; VI-NEXT: v_bfe_u32 v43, v16, 8, 8 +; VI-NEXT: v_bfe_u32 v46, v15, 8, 8 +; VI-NEXT: v_bfe_u32 v57, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v59, v13, 8, 8 +; VI-NEXT: v_bfe_u32 v62, v12, 8, 8 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v1, s58 +; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: v_mov_b32_e32 v52, s42 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v52, s44 +; VI-NEXT: v_mov_b32_e32 v19, s67 +; VI-NEXT: v_mov_b32_e32 v12, s66 +; VI-NEXT: v_mov_b32_e32 v20, s65 +; VI-NEXT: v_mov_b32_e32 v13, s64 +; VI-NEXT: v_mov_b32_e32 v21, s55 +; VI-NEXT: v_mov_b32_e32 v14, s54 +; VI-NEXT: v_mov_b32_e32 v22, s53 +; VI-NEXT: v_mov_b32_e32 v15, s52 +; VI-NEXT: v_mov_b32_e32 v23, s51 +; VI-NEXT: v_mov_b32_e32 v16, s50 +; VI-NEXT: v_mov_b32_e32 v24, s49 +; VI-NEXT: v_mov_b32_e32 v17, s48 +; VI-NEXT: v_mov_b32_e32 v25, s39 +; VI-NEXT: v_mov_b32_e32 v18, s38 +; VI-NEXT: v_mov_b32_e32 v26, s37 +; VI-NEXT: v_mov_b32_e32 v11, s36 +; VI-NEXT: v_mov_b32_e32 v35, s16 +; VI-NEXT: v_mov_b32_e32 v27, s17 +; VI-NEXT: v_mov_b32_e32 v36, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: v_mov_b32_e32 v37, s20 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v38, s22 +; VI-NEXT: v_mov_b32_e32 v30, s23 +; VI-NEXT: v_mov_b32_e32 v39, s24 +; VI-NEXT: v_mov_b32_e32 v31, s25 +; VI-NEXT: v_mov_b32_e32 v48, s26 +; VI-NEXT: v_mov_b32_e32 v32, s27 +; VI-NEXT: v_mov_b32_e32 v49, s28 +; VI-NEXT: v_mov_b32_e32 v33, s29 +; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_mov_b32_e32 v34, s5 +; VI-NEXT: v_mov_b32_e32 v62, s35 +; VI-NEXT: v_mov_b32_e32 v59, s34 +; VI-NEXT: v_mov_b32_e32 v57, s31 +; VI-NEXT: v_mov_b32_e32 v46, s91 +; VI-NEXT: v_mov_b32_e32 v43, s89 +; VI-NEXT: v_mov_b32_e32 v40, s79 +; VI-NEXT: v_mov_b32_e32 v10, s77 +; VI-NEXT: v_mov_b32_e32 v61, s30 +; VI-NEXT: v_mov_b32_e32 v58, s90 +; VI-NEXT: v_mov_b32_e32 v47, s88 +; VI-NEXT: v_mov_b32_e32 v45, s78 +; VI-NEXT: v_mov_b32_e32 v42, s76 +; VI-NEXT: v_mov_b32_e32 v55, s74 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v41, s59 +; VI-NEXT: v_mov_b32_e32 v44, s60 +; VI-NEXT: v_mov_b32_e32 v56, s61 +; VI-NEXT: v_mov_b32_e32 v60, s63 +; VI-NEXT: v_mov_b32_e32 v51, s72 +; VI-NEXT: v_mov_b32_e32 v1, s73 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v6, s10 +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_mov_b32_e32 v9, s75 +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v52, s62 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v62 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v21, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v46 +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s59, s5, 8 +; GFX9-NEXT: s_lshr_b32 s58, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s72, s29, 8 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s27, 8 +; GFX9-NEXT: s_lshr_b32 s76, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 8 +; GFX9-NEXT: s_lshr_b32 s89, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s95, s23, 8 +; GFX9-NEXT: s_lshr_b32 s94, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s36, s21, 8 +; GFX9-NEXT: s_lshr_b32 s35, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s49, s19, 8 +; GFX9-NEXT: s_lshr_b32 s48, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s54, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX9-NEXT: v_pk_add_f16 v20, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s21, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s20, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s23, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s22, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s25, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s24, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s27, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s26, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s29, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s28, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v17, s55 +; GFX9-NEXT: v_mov_b32_e32 v62, s53 +; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v60, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v57, s49 +; GFX9-NEXT: v_mov_b32_e32 v47, s39 +; GFX9-NEXT: v_mov_b32_e32 v56, s38 +; GFX9-NEXT: v_mov_b32_e32 v46, s37 +; GFX9-NEXT: v_mov_b32_e32 v45, s35 +; GFX9-NEXT: v_mov_b32_e32 v44, s36 +; GFX9-NEXT: v_mov_b32_e32 v42, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s31 +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v55, s95 +; GFX9-NEXT: v_mov_b32_e32 v53, s93 +; GFX9-NEXT: v_mov_b32_e32 v54, s92 +; GFX9-NEXT: v_mov_b32_e32 v52, s91 +; GFX9-NEXT: v_mov_b32_e32 v51, s89 +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: v_mov_b32_e32 v48, s88 +; GFX9-NEXT: v_mov_b32_e32 v49, s79 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v38, s76 +; GFX9-NEXT: v_mov_b32_e32 v37, s77 +; GFX9-NEXT: v_mov_b32_e32 v35, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s74 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s63 +; GFX9-NEXT: v_mov_b32_e32 v32, s72 +; GFX9-NEXT: v_mov_b32_e32 v30, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s61 +; GFX9-NEXT: v_mov_b32_e32 v29, s60 +; GFX9-NEXT: v_mov_b32_e32 v28, s58 +; GFX9-NEXT: v_mov_b32_e32 v27, s59 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v18, s56 +; GFX9-NEXT: v_mov_b32_e32 v23, s12 +; GFX9-NEXT: v_mov_b32_e32 v24, s10 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32f16_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s77 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s75 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s4 +; GFX11-TRUE16-NEXT: .LBB105_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v87, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xff, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v85, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v82, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v69, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v21, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v22, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v17, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v23, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v21, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v23, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v38, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v6 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32f16_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v23 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v6, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v4, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v2, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s49 :: v_dual_mov_b32 v87, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s48 :: v_dual_mov_b32 v85, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s37 :: v_dual_mov_b32 v83, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s34 :: v_dual_mov_b32 v81, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s31 :: v_dual_mov_b32 v71, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, vcc_hi :: v_dual_mov_b32 v69, s94 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s95 :: v_dual_mov_b32 v67, s93 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s92 :: v_dual_mov_b32 v65, s91 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s89 :: v_dual_mov_b32 v55, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s88 :: v_dual_mov_b32 v53, s79 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s78 :: v_dual_mov_b32 v51, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s77 :: v_dual_mov_b32 v49, s75 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s74 :: v_dual_mov_b32 v39, s73 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s63 :: v_dual_mov_b32 v37, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s62 :: v_dual_mov_b32 v35, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s60 :: v_dual_mov_b32 v33, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s59 :: v_dual_mov_b32 v31, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s56 :: v_dual_mov_b32 v29, s47 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v7, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s46 :: v_dual_mov_b32 v11, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s44 :: v_dual_mov_b32 v17, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v21, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s10 :: v_dual_mov_b32 v26, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_mov_b32 v28, s4 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v87, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v23, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v85, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v87, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v69, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, v24, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, v20, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v15, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v28, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v14, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v5, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v8 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[82:85], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v22 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v43 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v60 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v58 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v57 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v59 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v29, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v31, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v33, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v34, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v5, v32 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v7, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v9, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v11, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v13, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v15, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v17, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v19, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v5 -; GCN-NEXT: v_or_b32_e32 v40, v21, v44 -; GCN-NEXT: v_or_b32_e32 v22, v22, v45 -; GCN-NEXT: v_or_b32_e32 v41, v23, v46 -; GCN-NEXT: v_or_b32_e32 v24, v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB53_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v59 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v47, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v46, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v45, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v41 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v43 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v44, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v31 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v40 -; GCN-NEXT: .LBB53_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: v_mov_b32_e32 v2, v39 -; GCN-NEXT: v_mov_b32_e32 v4, v35 -; GCN-NEXT: v_mov_b32_e32 v6, v51 -; GCN-NEXT: v_mov_b32_e32 v8, v33 -; GCN-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NEXT: v_mov_b32_e32 v12, v49 -; GCN-NEXT: v_mov_b32_e32 v14, v53 -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: v_mov_b32_e32 v18, v34 -; GCN-NEXT: v_mov_b32_e32 v20, v36 -; GCN-NEXT: v_mov_b32_e32 v22, v38 -; GCN-NEXT: v_mov_b32_e32 v24, v48 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v26, v50 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v52 -; GCN-NEXT: v_mov_b32_e32 v30, v54 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v29 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v6 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v39 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v48 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v1, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v5, v43, v5 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v4 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_or_b32_e32 v23, v57, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v16, v7 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v28, v31, v28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v22, v59, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v40, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v62, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v26, v58, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v12, v49 +; SI-NEXT: v_mov_b32_e32 v14, v53 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v22, v38 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v50 +; SI-NEXT: v_mov_b32_e32 v28, v52 +; SI-NEXT: v_mov_b32_e32 v30, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v32f16: ; VI: ; %bb.0: @@ -41487,7 +83108,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: s_cbranch_execz .LBB106_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -41650,9 +83271,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: .LBB53_2: ; %Flow +; VI-NEXT: .LBB106_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_4 +; VI-NEXT: s_cbranch_execz .LBB106_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v38 @@ -41807,7 +83428,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: .LBB53_4: ; %end +; VI-NEXT: .LBB106_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -41951,7 +83572,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -42116,9 +83737,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: .LBB53_2: ; %Flow +; GFX9-NEXT: .LBB106_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_4 +; GFX9-NEXT: s_cbranch_execz .LBB106_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 @@ -42271,7 +83892,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 -; GFX9-NEXT: .LBB53_4: ; %end +; GFX9-NEXT: .LBB106_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -42402,15 +84023,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.l @@ -42541,8 +84162,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -42772,15 +84393,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 @@ -42927,8 +84548,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 @@ -43098,796 +84719,2886 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s74, v30 +; SI-NEXT: v_readfirstlane_b32 s61, v29 +; SI-NEXT: v_readfirstlane_b32 s63, v28 +; SI-NEXT: v_readfirstlane_b32 s59, v27 +; SI-NEXT: v_readfirstlane_b32 s60, v26 +; SI-NEXT: v_readfirstlane_b32 s57, v25 +; SI-NEXT: v_readfirstlane_b32 s58, v24 +; SI-NEXT: v_readfirstlane_b32 s47, v23 +; SI-NEXT: v_readfirstlane_b32 s56, v22 +; SI-NEXT: v_readfirstlane_b32 s44, v21 +; SI-NEXT: v_readfirstlane_b32 s34, v19 +; SI-NEXT: v_readfirstlane_b32 s37, v18 +; SI-NEXT: v_readfirstlane_b32 s94, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s90, v15 +; SI-NEXT: v_readfirstlane_b32 s93, v14 +; SI-NEXT: v_readfirstlane_b32 s79, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v12 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s38, v10 +; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_readfirstlane_b32 s35, v8 +; SI-NEXT: v_readfirstlane_b32 s92, v7 +; SI-NEXT: v_readfirstlane_b32 s95, v6 +; SI-NEXT: v_readfirstlane_b32 s89, v5 +; SI-NEXT: v_readfirstlane_b32 s91, v4 +; SI-NEXT: v_readfirstlane_b32 s78, v3 +; SI-NEXT: v_readfirstlane_b32 s88, v2 +; SI-NEXT: v_readfirstlane_b32 s76, v1 +; SI-NEXT: v_readfirstlane_b32 s77, v0 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s9, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s8, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s12, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s10, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s13, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s43, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s45, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s42, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s91, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s36, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s93, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_and_b32 s4, s58, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s11, 0xff +; SI-NEXT: s_lshl_b32 s6, s7, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s8, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_lshl_b32 s8, s10, 8 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s13, 8 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s43, 0xff +; SI-NEXT: s_lshl_b32 s10, s14, 8 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s45, 0xff +; SI-NEXT: s_lshl_b32 s11, s40, 8 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s73, 0xff +; SI-NEXT: s_lshl_b32 s12, s42, 8 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s75, 0xff +; SI-NEXT: s_lshl_b32 s13, s62, 8 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s74, 0xff +; SI-NEXT: s_lshl_b32 s14, s72, 8 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s63, 0xff +; SI-NEXT: s_lshl_b32 s15, s61, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s60, 0xff +; SI-NEXT: s_lshl_b32 s40, s59, 8 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: s_and_b32 s40, s58, 0xff +; SI-NEXT: s_lshl_b32 s41, s57, 8 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_and_b32 s41, s56, 0xff +; SI-NEXT: s_lshl_b32 s42, s47, 8 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_or_b32 s41, s42, s41 +; SI-NEXT: s_and_b32 s42, s46, 0xff +; SI-NEXT: s_lshl_b32 s43, s44, 8 +; SI-NEXT: s_add_i32 s37, s37, 3 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: s_and_b32 s43, s37, 0xff +; SI-NEXT: s_lshl_b32 s44, s34, 8 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_or_b32 s43, s44, s43 +; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_lshl_b32 s45, s94, 8 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_and_b32 s45, s93, 0xff +; SI-NEXT: s_lshl_b32 s46, s90, 8 +; SI-NEXT: s_add_i32 s39, s39, 3 +; SI-NEXT: s_or_b32 s45, s46, s45 +; SI-NEXT: s_and_b32 s46, s39, 0xff +; SI-NEXT: s_lshl_b32 s47, s79, 8 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s38, 0xff +; SI-NEXT: s_lshl_b32 s56, s36, 8 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_or_b32 s47, s56, s47 +; SI-NEXT: s_and_b32 s56, s35, 0xff +; SI-NEXT: s_lshl_b32 s57, s30, 8 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_or_b32 s56, s57, s56 +; SI-NEXT: s_and_b32 s57, s95, 0xff +; SI-NEXT: s_lshl_b32 s58, s92, 8 +; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_or_b32 s57, s58, s57 +; SI-NEXT: s_and_b32 s58, s91, 0xff +; SI-NEXT: s_lshl_b32 s59, s89, 8 +; SI-NEXT: s_add_i32 s88, s88, 3 +; SI-NEXT: s_or_b32 s58, s59, s58 +; SI-NEXT: s_and_b32 s59, s88, 0xff +; SI-NEXT: s_lshl_b32 s60, s78, 8 +; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s59, s60, s59 +; SI-NEXT: s_and_b32 s60, s77, 0xff +; SI-NEXT: s_lshl_b32 s61, s76, 8 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_lshl_b32 s27, s27, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_addk_i32 s40, 0x300 +; SI-NEXT: s_addk_i32 s41, 0x300 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_addk_i32 s43, 0x300 +; SI-NEXT: s_addk_i32 s44, 0x300 +; SI-NEXT: s_addk_i32 s45, 0x300 +; SI-NEXT: s_addk_i32 s46, 0x300 +; SI-NEXT: s_addk_i32 s47, 0x300 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_addk_i32 s57, 0x300 +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_addk_i32 s59, 0x300 +; SI-NEXT: s_addk_i32 s60, 0x300 +; SI-NEXT: s_addk_i32 s28, 0x300 +; SI-NEXT: s_addk_i32 s26, 0x300 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB107_2 +; +; VI-LABEL: bitcast_v64i8_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v19, v57 +; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_mov_b32_e32 v42, v17 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v41, v21 +; VI-NEXT: v_mov_b32_e32 v38, v20 +; VI-NEXT: v_mov_b32_e32 v39, v25 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v23, v50 +; VI-NEXT: v_mov_b32_e32 v50, v30 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v59, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v58 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v19 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v50 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v32 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v43 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v38 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v41 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v31 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v39 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v52 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v61 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v48 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v51 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v38, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v53 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v19 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v59 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_or_b32_sdwa v48, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v37, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v19, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v30 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v26, v19, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v23, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v23 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_or_b32_e32 v6, v6, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; VI-NEXT: v_or_b32_e32 v8, v8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; VI-NEXT: v_or_b32_e32 v9, v9, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; VI-NEXT: v_or_b32_e32 v10, v10, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; VI-NEXT: v_or_b32_e32 v11, v11, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 +; VI-NEXT: v_or_b32_sdwa v7, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v23, v50 +; VI-NEXT: v_mov_b32_e32 v19, v57 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v36, v13 +; VI-NEXT: v_mov_b32_e32 v42, v17 +; VI-NEXT: v_mov_b32_e32 v41, v21 +; VI-NEXT: v_mov_b32_e32 v39, v25 +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v38, v20 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v50, v30 +; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v18, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v96, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v86, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v96, 16, v97 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB107_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB107_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB107_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB107_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB107_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v35 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v37 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v50 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v55 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v28 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v29 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v57, v5, v34, 16 -; GCN-NEXT: v_alignbit_b32 v56, v14, v32, 16 -; GCN-NEXT: v_alignbit_b32 v47, v11, v36, 16 -; GCN-NEXT: v_alignbit_b32 v45, v10, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v13, v38, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v49, 16 -; GCN-NEXT: v_alignbit_b32 v39, v6, v15, 16 -; GCN-NEXT: v_alignbit_b32 v33, v21, v17, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v19, 16 -; GCN-NEXT: v_alignbit_b32 v26, v24, v20, 16 -; GCN-NEXT: v_alignbit_b32 v24, v3, v22, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v23, 16 -; GCN-NEXT: v_alignbit_b32 v16, v2, v25, 16 -; GCN-NEXT: v_alignbit_b32 v13, v52, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v1, v30, 16 -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v31 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v11 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB54_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v13 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v37 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v35 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v13, v26, v33, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v1, v34, 16 -; GCN-NEXT: v_alignbit_b32 v21, v27, v5, 16 -; GCN-NEXT: v_alignbit_b32 v16, v2, v36, 16 -; GCN-NEXT: v_alignbit_b32 v26, v28, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v3, v39, 16 -; GCN-NEXT: v_alignbit_b32 v33, v29, v17, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v30, v18, 16 -; GCN-NEXT: v_alignbit_b32 v39, v6, v15, 16 -; GCN-NEXT: v_alignbit_b32 v41, v32, v20, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v47, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v45, v10, v7, 16 -; GCN-NEXT: v_alignbit_b32 v57, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v56, v14, v25, 16 -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v31 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v11 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: .LBB54_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v61 -; GCN-NEXT: v_or_b32_e32 v12, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NEXT: v_or_b32_e32 v15, v7, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v60 -; GCN-NEXT: v_or_b32_e32 v17, v5, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v63 -; GCN-NEXT: v_or_b32_e32 v18, v5, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v59 -; GCN-NEXT: v_or_b32_e32 v19, v5, v7 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v5 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v14 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v62 -; GCN-NEXT: v_or_b32_e32 v20, v5, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v48 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_or_b32_e32 v22, v7, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v10 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v58 -; GCN-NEXT: v_or_b32_e32 v23, v9, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v33 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-NEXT: v_or_b32_e32 v25, v10, v14 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v31 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-NEXT: v_or_b32_e32 v28, v8, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v26, 8, v26 -; GCN-NEXT: v_or_b32_e32 v26, v14, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; GCN-NEXT: v_or_b32_e32 v24, v6, v24 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v33 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; GCN-NEXT: v_or_b32_e32 v33, v21, v33 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v16, v4, v16 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v53 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; GCN-NEXT: v_or_b32_e32 v53, v13, v53 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GCN-NEXT: v_or_b32_e32 v41, v3, v11 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: v_or_b32_e32 v32, v32, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 24, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v48, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v27, v27, v48 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v29, v29, v49 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v31, v31, v51 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v39, v39, v54 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v48, v50, v42 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v49, v52, v44 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v50, v55, v46 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v51, v40, v47 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_or_b32_e32 v52, v43, v56 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v37, v37, v57 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v54, v45, v58 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v41 -; GCN-NEXT: v_or_b32_e32 v35, v60, v35 -; GCN-NEXT: v_or_b32_e32 v12, v12, v36 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v34 -; GCN-NEXT: v_or_b32_e32 v18, v18, v38 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v48 -; GCN-NEXT: v_or_b32_e32 v27, v28, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v28, v33, v52 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v29, v53, v54 -; GCN-NEXT: v_or_b32_e32 v31, v55, v35 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; SI-NEXT: v_alignbit_b32 v48, v1, v38, 16 +; SI-NEXT: v_alignbit_b32 v50, v37, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v50, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_alignbit_b32 v23, v1, v52, 16 +; SI-NEXT: v_alignbit_b32 v21, v19, v49, 16 +; SI-NEXT: v_alignbit_b32 v1, v21, v23, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v23, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_alignbit_b32 v17, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v18, v16, v53, 16 +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_alignbit_b32 v14, v1, v42, 16 +; SI-NEXT: v_alignbit_b32 v15, v13, v40, 16 +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_alignbit_b32 v11, v1, v45, 16 +; SI-NEXT: v_alignbit_b32 v12, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_alignbit_b32 v8, v1, v47, 16 +; SI-NEXT: v_alignbit_b32 v9, v7, v24, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_alignbit_b32 v5, v1, v56, 16 +; SI-NEXT: v_alignbit_b32 v6, v4, v28, 16 +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_alignbit_b32 v2, v1, v57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_alignbit_b32 v3, v1, v29, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v18 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v34 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v15 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v33 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: .LBB108_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v21, v19, v20, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v48, v30, v20, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v50, v37, v20, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v20, v50, v48, 24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v50, v48, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v50, v48, 8 +; SI-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v21, v23, 24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v21, v23, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v21, v23, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 8 +; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v10 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v15, v14, 24 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v15, v14, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v15, v14, 8 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v7 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v9, v8, 24 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v6, v4, v6, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v9, v8, 8 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v27 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: .LBB108_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v59 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v60 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v64i8: ; VI: ; %bb.0: @@ -43961,7 +87672,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -44014,9 +87725,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -44357,7 +88068,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -44566,7 +88277,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -44619,9 +88330,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -44932,7 +88643,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -45118,7 +88829,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -45176,9 +88887,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v15.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 @@ -45476,7 +89187,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l @@ -45689,7 +89400,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -45739,9 +89450,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB54_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1 @@ -46062,7 +89773,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v28 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v26 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -46232,779 +89943,4500 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16 +; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 +; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 +; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 +; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 +; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 +; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 +; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 +; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 +; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 +; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 +; SI-NEXT: v_mov_b32_e32 v37, v33 +; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 +; SI-NEXT: v_mov_b32_e32 v48, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v50, v25 +; SI-NEXT: v_mov_b32_e32 v25, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 +; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16 +; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 +; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v30, v27, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v27, v36 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v23 +; SI-NEXT: v_or_b32_e32 v33, v33, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v33 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v30 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v63 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v59 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v37, v33 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s59, s5, 8 +; VI-NEXT: s_lshr_b32 s58, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s72, s29, 8 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s27, 8 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 8 +; VI-NEXT: s_lshr_b32 s89, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s35, s23, 8 +; VI-NEXT: s_lshr_b32 s34, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s48, s21, 8 +; VI-NEXT: s_lshr_b32 s39, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s53, s19, 8 +; VI-NEXT: s_lshr_b32 s52, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s66, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s6, v15 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s6, v15 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s6, v15 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v15 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s6, v15 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s6, v15 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s6, v15 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s6, v15 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; VI-NEXT: v_add_f32_e32 v5, s6, v15 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s6, v15 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 +; VI-NEXT: v_add_f32_e32 v5, s6, v15 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; VI-NEXT: v_add_f32_e32 v7, s6, v15 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 +; VI-NEXT: v_add_f32_e32 v7, s6, v15 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s6, v15 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; VI-NEXT: v_add_f32_e32 v7, s6, v15 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s6, v15 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 +; VI-NEXT: v_add_f32_e32 v9, s6, v15 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_add_f32_e32 v10, s6, v15 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; VI-NEXT: v_add_f32_e32 v9, s6, v15 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; VI-NEXT: v_add_f32_e32 v11, s6, v15 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v15 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_add_f32_e32 v12, s6, v15 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_lshl_b32 s6, s26, 16 +; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v15 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s6, v15 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_lshl_b32 s6, s29, 16 +; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 +; VI-NEXT: v_add_f32_e32 v13, s6, v15 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc +; VI-NEXT: v_add_f32_e32 v14, s6, v15 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_lshl_b32 s6, s28, 16 +; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; VI-NEXT: v_add_f32_e32 v13, s6, v15 +; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s6, v15 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s6, s5, 16 +; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 +; VI-NEXT: v_add_f32_e32 v16, s6, v15 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s5, v15 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: s_lshl_b32 s5, s4, 16 +; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; VI-NEXT: v_add_f32_e32 v17, s5, v15 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v15, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; VI-NEXT: s_branch .LBB109_5 +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v19, s42 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: v_mov_b32_e32 v18, s67 +; VI-NEXT: v_mov_b32_e32 v62, s65 +; VI-NEXT: v_mov_b32_e32 v17, s66 +; VI-NEXT: v_mov_b32_e32 v60, s64 +; VI-NEXT: v_mov_b32_e32 v61, s55 +; VI-NEXT: v_mov_b32_e32 v58, s54 +; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v57, s53 +; VI-NEXT: v_mov_b32_e32 v47, s51 +; VI-NEXT: v_mov_b32_e32 v56, s50 +; VI-NEXT: v_mov_b32_e32 v46, s49 +; VI-NEXT: v_mov_b32_e32 v45, s39 +; VI-NEXT: v_mov_b32_e32 v44, s48 +; VI-NEXT: v_mov_b32_e32 v42, s38 +; VI-NEXT: v_mov_b32_e32 v43, s37 +; VI-NEXT: v_mov_b32_e32 v41, s36 +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: v_mov_b32_e32 v55, s35 +; VI-NEXT: v_mov_b32_e32 v53, s31 +; VI-NEXT: v_mov_b32_e32 v54, s30 +; VI-NEXT: v_mov_b32_e32 v52, s91 +; VI-NEXT: v_mov_b32_e32 v51, s89 +; VI-NEXT: v_mov_b32_e32 v50, s90 +; VI-NEXT: v_mov_b32_e32 v48, s88 +; VI-NEXT: v_mov_b32_e32 v49, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v38, s76 +; VI-NEXT: v_mov_b32_e32 v37, s77 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v34, s73 +; VI-NEXT: v_mov_b32_e32 v33, s63 +; VI-NEXT: v_mov_b32_e32 v32, s72 +; VI-NEXT: v_mov_b32_e32 v30, s62 +; VI-NEXT: v_mov_b32_e32 v31, s61 +; VI-NEXT: v_mov_b32_e32 v29, s60 +; VI-NEXT: v_mov_b32_e32 v28, s58 +; VI-NEXT: v_mov_b32_e32 v27, s59 +; VI-NEXT: v_mov_b32_e32 v25, s57 +; VI-NEXT: v_mov_b32_e32 v26, s56 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s10 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s14 +; VI-NEXT: .LBB109_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-NEXT: v_writelane_b32 v4, s31, 1 +; GFX9-NEXT: v_writelane_b32 v4, s34, 2 +; GFX9-NEXT: v_writelane_b32 v4, s35, 3 +; GFX9-NEXT: v_writelane_b32 v4, s36, 4 +; GFX9-NEXT: v_writelane_b32 v4, s37, 5 +; GFX9-NEXT: v_writelane_b32 v4, s38, 6 +; GFX9-NEXT: v_writelane_b32 v4, s39, 7 +; GFX9-NEXT: v_writelane_b32 v4, s48, 8 +; GFX9-NEXT: v_writelane_b32 v4, s49, 9 +; GFX9-NEXT: v_writelane_b32 v4, s50, 10 +; GFX9-NEXT: v_writelane_b32 v4, s51, 11 +; GFX9-NEXT: v_writelane_b32 v4, s52, 12 +; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s92, s5, 24 +; GFX9-NEXT: s_lshr_b32 s91, s5, 16 +; GFX9-NEXT: s_lshr_b32 s93, s5, 8 +; GFX9-NEXT: s_lshr_b32 s94, s4, 16 +; GFX9-NEXT: s_lshr_b32 s95, s4, 8 +; GFX9-NEXT: s_lshr_b32 s30, s29, 24 +; GFX9-NEXT: s_lshr_b32 s90, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s29, 8 +; GFX9-NEXT: s_lshr_b32 s31, s28, 16 +; GFX9-NEXT: s_lshr_b32 s74, s28, 8 +; GFX9-NEXT: s_lshr_b32 s34, s27, 24 +; GFX9-NEXT: s_lshr_b32 s89, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s27, 8 +; GFX9-NEXT: s_lshr_b32 s35, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s26, 8 +; GFX9-NEXT: s_lshr_b32 s36, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s25, 8 +; GFX9-NEXT: s_lshr_b32 s37, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s79, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s23, 8 +; GFX9-NEXT: s_lshr_b32 s39, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s22, 8 +; GFX9-NEXT: s_lshr_b32 s48, s21, 24 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s21, 8 +; GFX9-NEXT: s_lshr_b32 s49, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s50, s19, 24 +; GFX9-NEXT: s_lshr_b32 s77, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s19, 8 +; GFX9-NEXT: s_lshr_b32 s51, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s18, 8 +; GFX9-NEXT: s_lshr_b32 s52, s17, 24 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB109_3 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s76, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s17, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s16, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s19, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s18, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s78, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s21, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s20, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s23, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s22, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s88, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s25, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s24, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s89, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s27, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s90, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s29, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s28, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_lshr_b32 s91, s6, 16 +; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s9 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 +; GFX9-NEXT: s_lshr_b32 s92, s31, 24 +; GFX9-NEXT: s_lshr_b32 s93, s31, 8 +; GFX9-NEXT: s_lshr_b32 s94, s30, 16 +; GFX9-NEXT: s_lshr_b32 s95, s30, 8 +; GFX9-NEXT: s_lshr_b32 s30, s75, 24 +; GFX9-NEXT: s_lshr_b32 s75, s75, 8 +; GFX9-NEXT: s_lshr_b32 s31, s74, 16 +; GFX9-NEXT: s_lshr_b32 s74, s74, 8 +; GFX9-NEXT: s_lshr_b32 s34, s73, 24 +; GFX9-NEXT: s_lshr_b32 s73, s73, 8 +; GFX9-NEXT: s_lshr_b32 s35, s72, 16 +; GFX9-NEXT: s_lshr_b32 s72, s72, 8 +; GFX9-NEXT: s_lshr_b32 s36, s63, 24 +; GFX9-NEXT: s_lshr_b32 s63, s63, 8 +; GFX9-NEXT: s_lshr_b32 s37, s62, 16 +; GFX9-NEXT: s_lshr_b32 s62, s62, 8 +; GFX9-NEXT: s_lshr_b32 s38, s61, 24 +; GFX9-NEXT: s_lshr_b32 s61, s61, 8 +; GFX9-NEXT: s_lshr_b32 s39, s60, 16 +; GFX9-NEXT: s_lshr_b32 s60, s60, 8 +; GFX9-NEXT: s_lshr_b32 s48, s59, 24 +; GFX9-NEXT: s_lshr_b32 s59, s59, 8 +; GFX9-NEXT: s_lshr_b32 s49, s58, 16 +; GFX9-NEXT: s_lshr_b32 s58, s58, 8 +; GFX9-NEXT: s_lshr_b32 s50, s57, 24 +; GFX9-NEXT: s_lshr_b32 s57, s57, 8 +; GFX9-NEXT: s_lshr_b32 s51, s56, 16 +; GFX9-NEXT: s_lshr_b32 s56, s56, 8 +; GFX9-NEXT: s_lshr_b32 s52, s47, 24 +; GFX9-NEXT: s_lshr_b32 s53, s47, 8 +; GFX9-NEXT: s_lshr_b32 s54, s46, 16 +; GFX9-NEXT: s_lshr_b32 s55, s46, 8 +; GFX9-NEXT: .LBB109_3: ; %end +; GFX9-NEXT: s_and_b32 s7, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s55, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s44, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s52, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s51, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s42, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s57, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s50, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s58, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s40, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s59, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s48, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s60, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s14, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s38, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s62, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s63, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s36, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s72, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s73, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s89, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s34, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s74, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s31, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s75, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s7, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s93, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s92, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v4, 15 +; GFX9-NEXT: v_readlane_b32 s54, v4, 14 +; GFX9-NEXT: v_readlane_b32 s53, v4, 13 +; GFX9-NEXT: v_readlane_b32 s52, v4, 12 +; GFX9-NEXT: v_readlane_b32 s51, v4, 11 +; GFX9-NEXT: v_readlane_b32 s50, v4, 10 +; GFX9-NEXT: v_readlane_b32 s49, v4, 9 +; GFX9-NEXT: v_readlane_b32 s48, v4, 8 +; GFX9-NEXT: v_readlane_b32 s39, v4, 7 +; GFX9-NEXT: v_readlane_b32 s38, v4, 6 +; GFX9-NEXT: v_readlane_b32 s37, v4, 5 +; GFX9-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: s_branch .LBB109_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_3 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s1, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s5, s1 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s1, s46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s0 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s0, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s5, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s3, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s6, s3 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s2, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s2 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s6, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s6, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s17, s56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s7, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s16, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s7, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s7, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s31, s19, s57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s30, s18, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s93, s21, s58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s92, s20, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s89, s23, s59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s22, s12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s25, s60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s24, s13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s11, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s2, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s3, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 vcc_hi, s27, s61 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s6, s5 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s26, s5, s4 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[76:77], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[88:89], 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 vcc_lo, s26, s49 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[92:93], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[30:31], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, vcc_hi, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, vcc_hi, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s77, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s77, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s76, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s76, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s89, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s89, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s88, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s88, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s93, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s93, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s92, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s92, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s31, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s31, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s30, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s30, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s45, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s45, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s44, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s44, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s43, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s43, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], vcc, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, vcc_lo, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, vcc_lo, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-TRUE16-NEXT: .LBB109_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s48 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s39 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s37 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s35 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s31 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s94 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s93 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s92 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s91 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s73 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s72 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s50, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s51, 10 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_3 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s1, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s5, s1 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s43, s1, s46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s0 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s42, s0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s5, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s3, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s6, s3 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s2, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, s2 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s6, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s17, s56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s16, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s31, s19, s57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s30, s18, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s93, s21, s58 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s92, s20, s8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s89, s23, s59 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s11, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s88, s22, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s77, s25, s60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s76, s24, s13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s11, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s2, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s3, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s51, s27, s61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s6, s5 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s26, s5, s4 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[76:77], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[88:89], 24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s50, s26, s48 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[92:93], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[30:31], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s51, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s51, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s77, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s77, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s76, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s76, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s89, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s89, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s88, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s88, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s93, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s93, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s92, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s92, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s31, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s31, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s30, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s30, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s45, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s44, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s44, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s43, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s43, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[50:51], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s50, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s50, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-FAKE16-NEXT: .LBB109_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s48, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s39, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s38, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s37, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s36, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s28, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s35, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s47, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s34, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s31, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s30, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s45, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, vcc_hi, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s95, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s94, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s93, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s92, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s91, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s90, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s89, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s79, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s78, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s75, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s74, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s73, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s60, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s72, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s63, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s62, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v17, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v17, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v7 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v19 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v27 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v30 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v26 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v10 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v13 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v7 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v4, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v5, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v5, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v61 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v60 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v44 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v58 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v57 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v17 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v35, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v39, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v50 -; GCN-NEXT: v_or_b32_e32 v26, v51, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v52 -; GCN-NEXT: v_or_b32_e32 v30, v53, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v54 -; GCN-NEXT: v_or_b32_e32 v43, v5, v55 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v5, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v41 -; GCN-NEXT: v_or_b32_e32 v33, v0, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v0, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v0, v7 -; GCN-NEXT: v_or_b32_e32 v51, v1, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v0, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v0, v13 -; GCN-NEXT: v_or_b32_e32 v32, v4, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v0, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v14 -; GCN-NEXT: v_or_b32_e32 v36, v8, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v0, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v0, v27 -; GCN-NEXT: v_or_b32_e32 v48, v9, v28 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v0, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v31 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v0, v42 -; GCN-NEXT: v_or_b32_e32 v52, v2, v44 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v0, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v26 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v0, v25 -; GCN-NEXT: v_or_b32_e32 v40, v10, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v0, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v30 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v0, v47 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v30, v3 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v17 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v7, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v26, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v58 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v45 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v61 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v9 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; GCN-NEXT: v_or_b32_e32 v14, v14, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v49 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v32, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v51 -; GCN-NEXT: v_or_b32_e32 v13, v13, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; GCN-NEXT: v_or_b32_e32 v8, v8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GCN-NEXT: v_or_b32_e32 v0, v0, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v22, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v22, v1 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v31 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v32 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v33 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v34 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s7, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v3, v9, v3 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_or_b32_e32 v4, v4, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v10 -; GCN-NEXT: v_or_b32_e32 v6, v23, v22 -; GCN-NEXT: v_or_b32_e32 v7, v24, v14 -; GCN-NEXT: v_or_b32_e32 v9, v30, v29 -; GCN-NEXT: v_or_b32_e32 v10, v31, v13 -; GCN-NEXT: v_or_b32_e32 v13, v26, v25 -; GCN-NEXT: v_or_b32_e32 v11, v27, v11 -; GCN-NEXT: v_or_b32_e32 v14, v15, v28 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-NEXT: v_or_b32_e32 v15, v20, v19 -; GCN-NEXT: v_or_b32_e32 v0, v21, v0 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v19 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v9 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v16 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v43 -; GCN-NEXT: v_mov_b32_e32 v1, v35 -; GCN-NEXT: v_mov_b32_e32 v2, v49 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v39 -; GCN-NEXT: v_mov_b32_e32 v8, v51 -; GCN-NEXT: v_mov_b32_e32 v9, v55 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mov_b32_e32 v10, v41 -; GCN-NEXT: v_mov_b32_e32 v12, v32 -; GCN-NEXT: v_mov_b32_e32 v14, v34 -; GCN-NEXT: v_mov_b32_e32 v16, v36 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mov_b32_e32 v17, v37 -; GCN-NEXT: v_mov_b32_e32 v18, v38 -; GCN-NEXT: v_mov_b32_e32 v20, v48 -; GCN-NEXT: v_mov_b32_e32 v22, v50 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mov_b32_e32 v24, v52 -; GCN-NEXT: v_mov_b32_e32 v25, v53 -; GCN-NEXT: v_mov_b32_e32 v26, v54 -; GCN-NEXT: v_mov_b32_e32 v28, v40 -; GCN-NEXT: v_mov_b32_e32 v30, v42 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v27 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v3, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_or_b32_e32 v33, v7, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v63 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v51, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v55, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v11, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v32, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v13, v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v15, v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v36, v1, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v37, v26, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v18, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v48, v1, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v57 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v21, v30, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v23, v25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v52, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v53, v16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v45, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v49 +; SI-NEXT: v_or_b32_e32 v40, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v29, v56, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v61, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v57 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v20 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v10 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v43 +; SI-NEXT: v_mov_b32_e32 v10, v41 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v28, v40 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v4, v33 +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: v_mov_b32_e32 v8, v51 +; SI-NEXT: v_mov_b32_e32 v9, v55 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mov_b32_e32 v16, v36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v17, v37 +; SI-NEXT: v_mov_b32_e32 v18, v38 +; SI-NEXT: v_mov_b32_e32 v20, v48 +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v32bf16: ; VI: ; %bb.0: @@ -47118,7 +94550,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 +; VI-NEXT: s_cbranch_execz .LBB110_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -47281,9 +94713,9 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: .LBB55_2: ; %Flow +; VI-NEXT: .LBB110_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_4 +; VI-NEXT: s_cbranch_execz .LBB110_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v38 @@ -47438,7 +94870,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: .LBB55_4: ; %end +; VI-NEXT: .LBB110_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -47582,7 +95014,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 +; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -47747,9 +95179,9 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: .LBB55_2: ; %Flow +; GFX9-NEXT: .LBB110_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_4 +; GFX9-NEXT: s_cbranch_execz .LBB110_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 @@ -47902,7 +95334,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 -; GFX9-NEXT: .LBB55_4: ; %end +; GFX9-NEXT: .LBB110_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -48033,15 +95465,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.l @@ -48172,8 +95604,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -48403,15 +95835,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 @@ -48558,8 +95990,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 @@ -48728,3 +96160,2077 @@ end: %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <32 x bfloat> %phi } + +define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s46, v30 +; SI-NEXT: v_readfirstlane_b32 s44, v23 +; SI-NEXT: v_readfirstlane_b32 s45, v22 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v32 +; SI-NEXT: v_readfirstlane_b32 s56, v33 +; SI-NEXT: v_readfirstlane_b32 s57, v34 +; SI-NEXT: v_readfirstlane_b32 s47, v35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v38 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 +; SI-NEXT: s_cbranch_scc0 .LBB111_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: s_or_b32 s14, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_or_b32 s42, s5, s4 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s60, s4, 16 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: s_lshl_b32 s61, s4, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v19 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v51 +; SI-NEXT: v_or_b32_e32 v37, v13, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v39, v21, v17 +; SI-NEXT: s_lshl_b32 s62, s4, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v24 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_or_b32_e32 v32, v29, v25 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v38, v1, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v33, v14, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v26 +; SI-NEXT: s_lshl_b32 s63, s4, 16 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: v_or_b32_e32 v34, v42, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v48, v15, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v36, v23, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v49 +; SI-NEXT: s_lshl_b32 s72, s4, 16 +; SI-NEXT: v_or_b32_e32 v35, v31, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v53 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v9, v0, v9 +; SI-NEXT: v_or_b32_e32 v13, v5, v13 +; SI-NEXT: v_or_b32_e32 v15, v6, v15 +; SI-NEXT: v_or_b32_e32 v17, v7, v17 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v23, v30, v23 +; SI-NEXT: v_or_b32_e32 v25, v41, v25 +; SI-NEXT: v_or_b32_e32 v29, v44, v29 +; SI-NEXT: s_lshl_b32 s73, s4, 16 +; SI-NEXT: v_or_b32_e32 v31, v45, v31 +; SI-NEXT: s_cbranch_execnz .LBB111_4 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v43 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v45, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v43, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v54 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v13, v44, v13 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v9 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v51 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: v_add_i32_e32 v32, vcc, 0x3000000, v9 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v27 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v13, v22, v13 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v9 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v19 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s19, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s6, 16 +; SI-NEXT: s_and_b32 s15, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s5, 16 +; SI-NEXT: s_and_b32 s42, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v43 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: s_branch .LBB111_5 +; SI-NEXT: .LBB111_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB111_2 +; SI-NEXT: .LBB111_4: +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v14, s61 +; SI-NEXT: v_mov_b32_e32 v18, s62 +; SI-NEXT: v_mov_b32_e32 v22, s63 +; SI-NEXT: v_mov_b32_e32 v26, s72 +; SI-NEXT: v_mov_b32_e32 v30, s73 +; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s42 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v11, v38 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v24, v32 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v13 +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v62 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v63 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v19, v57 +; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_mov_b32_e32 v42, v17 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v41, v21 +; VI-NEXT: v_mov_b32_e32 v38, v20 +; VI-NEXT: v_mov_b32_e32 v39, v25 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v23, v50 +; VI-NEXT: v_mov_b32_e32 v50, v30 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v59, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v58 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v19 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v50 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v10, 24, v32 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v56 +; VI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v43 +; VI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v38 +; VI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; VI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v34 +; VI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; VI-NEXT: v_lshlrev_b32_e32 v8, 24, v41 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v31 +; VI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; VI-NEXT: v_lshlrev_b32_e32 v9, 24, v39 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: s_and_b32 s11, s28, 0xff +; VI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; VI-NEXT: v_lshlrev_b32_e32 v11, 24, v52 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s11, s24, 0xff +; VI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; VI-NEXT: v_lshlrev_b32_e32 v12, 24, v61 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v48 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s11, s20, 0xff +; VI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v51 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: v_or_b32_sdwa v38, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s8, s11 +; VI-NEXT: s_and_b32 s11, s16, 0xff +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: s_or_b32 s7, s7, s11 +; VI-NEXT: s_and_b32 s13, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 24 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v53 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_and_b32 s12, s22, 0xff +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v19 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: s_and_b32 s11, s26, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xffff +; VI-NEXT: s_lshl_b32 s8, s12, 16 +; VI-NEXT: s_lshl_b32 s4, s27, 24 +; VI-NEXT: v_add_u32_e32 v48, vcc, 3, v59 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s11, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v63 +; VI-NEXT: v_or_b32_sdwa v48, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xffff +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v37, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v19, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v30 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v26, v19, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v4, 24, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v3, s7, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v23, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v23 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_and_b32_e32 v33, 0xff, v34 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; VI-NEXT: v_or_b32_e32 v6, v6, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; VI-NEXT: v_or_b32_e32 v8, v8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; VI-NEXT: v_or_b32_e32 v9, v9, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; VI-NEXT: v_or_b32_e32 v10, v10, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; VI-NEXT: v_or_b32_e32 v11, v11, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v19 +; VI-NEXT: v_or_b32_sdwa v7, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v23, v50 +; VI-NEXT: v_mov_b32_e32 v19, v57 +; VI-NEXT: v_mov_b32_e32 v27, v49 +; VI-NEXT: v_mov_b32_e32 v36, v13 +; VI-NEXT: v_mov_b32_e32 v42, v17 +; VI-NEXT: v_mov_b32_e32 v41, v21 +; VI-NEXT: v_mov_b32_e32 v39, v25 +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v38, v20 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v50, v30 +; VI-NEXT: v_mov_b32_e32 v49, v55 +; VI-NEXT: v_mov_b32_e32 v57, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v18, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v96, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v86, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v96, 16, v97 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB111_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB111_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB111_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB111_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB111_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 8ae7b58330256..5624a08cd89fc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -1,42 +1,42 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v18f32: ; VI: ; %bb.0: @@ -148,37 +148,251 @@ end: ret <18 x float> %phi } +define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v18i32_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v18i32_to_v18f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s29, s29, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v18i32: ; VI: ; %bb.0: @@ -187,7 +401,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -207,7 +421,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -218,7 +432,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -238,7 +452,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -250,7 +464,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -261,7 +475,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -281,37 +495,304 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v18f32_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v18f32_to_v18i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v17, s53, 1.0 +; GFX11-NEXT: v_add_f32_e64 v16, s52, 1.0 +; GFX11-NEXT: v_add_f32_e64 v15, s51, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s50, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s49, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s48, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s47, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s46, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s45, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s44, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s43, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s42, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s39, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s38, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s37, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s36, 1.0 +; GFX11-NEXT: s_branch .LBB3_5 +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB3_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v9i64: ; VI: ; %bb.0: @@ -320,7 +801,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -340,7 +821,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -351,7 +832,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -371,7 +852,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -383,7 +864,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -403,7 +884,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -423,37 +904,251 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v18i32_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v18i32_to_v9i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s29, s29, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v18i32: ; VI: ; %bb.0: @@ -462,7 +1157,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -482,7 +1177,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -493,7 +1188,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -513,7 +1208,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -525,7 +1220,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -550,7 +1245,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -570,77 +1265,291 @@ end: ret <18 x i32> %phi } -define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v18i32_to_v9f64: +; VI-LABEL: bitcast_v9i64_to_v18i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v18i32_to_v9f64: +; GFX9-LABEL: bitcast_v9i64_to_v18i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v9i64_to_v18i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + +define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v18i32_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18i32_to_v9f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18i32_to_v9f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -660,7 +1569,7 @@ define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -672,7 +1581,7 @@ define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -692,7 +1601,7 @@ define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -712,28 +1621,242 @@ end: ret <9 x double> %phi } +define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v18i32_to_v9f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v9f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v18i32_to_v9f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s29, s29, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi +} + define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v18i32: ; VI: ; %bb.0: @@ -742,7 +1865,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -753,7 +1876,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -764,7 +1887,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -775,7 +1898,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -787,7 +1910,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -798,7 +1921,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -818,184 +1941,432 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v9f64_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v9f64_to_v18i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[16:17], s[52:53], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[50:51], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[48:49], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[46:47], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[38:39], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[36:37], 1.0 +; GFX11-NEXT: s_branch .LBB11_5 +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB11_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36i16: ; VI: ; %bb.0: @@ -1021,7 +2392,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -1041,9 +2412,9 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -1081,7 +2452,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -1145,7 +2516,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -1165,9 +2536,9 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -1205,7 +2576,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -1236,7 +2607,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -1256,7 +2627,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1285,7 +2656,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -1305,9 +2676,9 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -1345,7 +2716,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -1384,269 +2755,1030 @@ end: ret <36 x i16> %phi } +define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v18i32_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v18i32: ; VI: ; %bb.0: @@ -1675,7 +3807,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1732,9 +3864,9 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -1791,7 +3923,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -1861,7 +3993,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -1926,9 +4058,9 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -1975,7 +4107,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2004,7 +4136,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2024,7 +4156,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2072,7 +4204,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2092,7 +4224,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2112,337 +4244,1115 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v36i16_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB15_5 +; GFX11-TRUE16-NEXT: .LBB15_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB15_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_3: +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36f16: ; VI: ; %bb.0: @@ -2468,7 +5378,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -2488,9 +5398,9 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -2528,7 +5438,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -2592,7 +5502,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -2612,9 +5522,9 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -2652,7 +5562,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -2683,7 +5593,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -2703,7 +5613,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2732,7 +5642,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -2752,9 +5662,9 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -2792,7 +5702,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -2831,345 +5741,1213 @@ end: ret <36 x half> %phi } +define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s28, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s8, 16 +; SI-NEXT: s_lshr_b32 s47, s7, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v18i32_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v18i32: ; VI: ; %bb.0: @@ -3198,7 +6976,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3255,9 +7033,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3314,7 +7092,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3384,7 +7162,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -3449,9 +7227,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -3499,7 +7277,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3528,7 +7306,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -3548,7 +7326,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3596,7 +7374,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -3616,7 +7394,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3636,37 +7414,912 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v36f16_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB19_5 +; GFX11-TRUE16-NEXT: .LBB19_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB19_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_3: +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v9i64: ; VI: ; %bb.0: @@ -3675,7 +8328,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -3695,7 +8348,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3706,7 +8359,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -3726,7 +8379,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3738,7 +8391,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -3749,7 +8402,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3769,37 +8422,304 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v18f32_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v18f32_to_v9i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v17, s53, 1.0 +; GFX11-NEXT: v_add_f32_e64 v16, s52, 1.0 +; GFX11-NEXT: v_add_f32_e64 v15, s51, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s50, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s49, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s48, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s47, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s46, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s45, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s44, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s43, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s42, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s39, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s38, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s37, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s36, 1.0 +; GFX11-NEXT: s_branch .LBB21_5 +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB21_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v18f32: ; VI: ; %bb.0: @@ -3808,7 +8728,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -3828,7 +8748,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3839,7 +8759,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -3859,7 +8779,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3871,7 +8791,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3896,67 +8816,453 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v9i64_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v9i64_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v9i64_to_v18f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v18f32_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18f32_to_v9f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f32_to_v9f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f32_to_v9f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <9 x i64> %a, splat (i64 3) - %a2 = bitcast <9 x i64> %a1 to <18 x float> + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <9 x double> br label %end cmp.false: - %a3 = bitcast <9 x i64> %a to <18 x float> + %a3 = bitcast <18 x float> %a to <9 x double> br label %end end: - %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <18 x float> %phi + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi } -define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v18f32_to_v9f64: +; VI-LABEL: bitcast_v18f32_to_v9f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -3975,19 +9281,39 @@ define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB25_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 ; -; GFX9-LABEL: bitcast_v18f32_to_v9f64: +; GFX9-LABEL: bitcast_v18f32_to_v9f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -4006,31 +9332,106 @@ define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB25_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 ; -; GFX11-LABEL: bitcast_v18f32_to_v9f64: +; GFX11-LABEL: bitcast_v18f32_to_v9f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v17, s53, 1.0 +; GFX11-NEXT: v_add_f32_e64 v16, s52, 1.0 +; GFX11-NEXT: v_add_f32_e64 v15, s51, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s50, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s49, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s48, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s47, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s46, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s45, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s44, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s43, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s42, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s39, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s38, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s37, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s36, 1.0 +; GFX11-NEXT: s_branch .LBB25_5 +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB25_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4050,27 +9451,27 @@ end: } define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v18f32: ; VI: ; %bb.0: @@ -4079,7 +9480,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -4090,7 +9491,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4101,7 +9502,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -4112,7 +9513,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4124,7 +9525,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -4135,7 +9536,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4155,184 +9556,432 @@ end: ret <18 x float> %phi } +define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v9f64_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v9f64_to_v18f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[16:17], s[52:53], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[50:51], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[48:49], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[46:47], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[38:39], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[36:37], 1.0 +; GFX11-NEXT: s_branch .LBB27_5 +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB27_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v36i16: ; VI: ; %bb.0: @@ -4358,7 +10007,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -4378,9 +10027,9 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -4418,7 +10067,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -4482,7 +10131,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -4502,9 +10151,9 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -4542,7 +10191,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -4573,7 +10222,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -4584,7 +10233,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4613,7 +10262,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -4633,9 +10282,9 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -4649,42 +10298,852 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + +define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v5, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v33, v18, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v33, v18, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v33 +; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v18f32_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v15, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v18, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v19, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v17, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v5, v34, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-TRUE16-NEXT: s_branch .LBB29_5 +; GFX11-TRUE16-NEXT: .LBB29_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s5 +; GFX11-TRUE16-NEXT: .LBB29_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: s_branch .LBB29_5 +; GFX11-FAKE16-NEXT: .LBB29_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5 +; GFX11-FAKE16-NEXT: .LBB29_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4704,268 +11163,267 @@ end: } define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v18f32: ; VI: ; %bb.0: @@ -4994,7 +11452,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5051,9 +11509,9 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -5110,7 +11568,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5180,7 +11638,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -5245,9 +11703,9 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -5294,7 +11752,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5323,7 +11781,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -5343,7 +11801,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5391,7 +11849,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -5411,7 +11869,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5431,337 +11889,1115 @@ end: ret <18 x float> %phi } +define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v36i16_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB31_5 +; GFX11-TRUE16-NEXT: .LBB31_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB31_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_3: +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v36f16: ; VI: ; %bb.0: @@ -5787,7 +13023,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -5807,9 +13043,9 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -5847,7 +13083,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -5911,7 +13147,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -5931,9 +13167,9 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -5971,7 +13207,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -6002,7 +13238,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -6013,7 +13249,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6042,78 +13278,986 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + +define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v18f32_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v15, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v18, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v19, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v17, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v5, v34, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-TRUE16-NEXT: s_branch .LBB33_5 +; GFX11-TRUE16-NEXT: .LBB33_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s5 +; GFX11-TRUE16-NEXT: .LBB33_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: s_branch .LBB33_5 +; GFX11-FAKE16-NEXT: .LBB33_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5 +; GFX11-FAKE16-NEXT: .LBB33_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6133,344 +14277,361 @@ end: } define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v18f32: ; VI: ; %bb.0: @@ -6499,7 +14660,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6556,9 +14717,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -6615,7 +14776,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6685,7 +14846,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -6750,9 +14911,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -6800,7 +14961,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6829,7 +14990,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -6849,7 +15010,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6897,7 +15058,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -6917,7 +15078,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6933,51 +15094,1112 @@ cmp.false: br label %end end: - %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <18 x float> %phi + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v36f16_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB35_5 +; GFX11-TRUE16-NEXT: .LBB35_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB35_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_3: +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v9i64_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9i64_to_v9f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9i64_to_v9f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9i64_to_v9f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi } -define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v9i64_to_v9f64: +; VI-LABEL: bitcast_v9i64_to_v9f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -6996,19 +16218,39 @@ define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v9i64_to_v9f64: +; GFX9-LABEL: bitcast_v9i64_to_v9f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -7027,46 +16269,53 @@ define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v9i64_to_v9f64: +; GFX11-LABEL: bitcast_v9i64_to_v9f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: s_branch .LBB37_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7085,27 +16334,27 @@ end: } define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v9i64: ; VI: ; %bb.0: @@ -7114,7 +16363,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7125,7 +16374,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7136,7 +16385,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7147,7 +16396,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7159,7 +16408,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7170,7 +16419,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7190,184 +16439,432 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v9f64_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v9f64_to_v9i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[36:37], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[38:39], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[46:47], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[48:49], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[50:51], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], s[52:53], 1.0 +; GFX11-NEXT: s_branch .LBB39_5 +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB39_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v7, v7, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v16, v16, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36i16: ; VI: ; %bb.0: @@ -7393,7 +16890,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -7413,9 +16910,9 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -7453,7 +16950,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -7517,7 +17014,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -7537,9 +17034,9 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -7577,7 +17074,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -7608,7 +17105,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7633,7 +17130,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7662,7 +17159,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -7682,9 +17179,9 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7727,7 +17224,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -7766,269 +17263,1030 @@ end: ret <36 x i16> %phi } +define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v9i64_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v9i64_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v9i64: ; VI: ; %bb.0: @@ -8057,7 +18315,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8114,9 +18372,9 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -8173,7 +18431,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8243,7 +18501,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -8308,9 +18566,9 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -8357,7 +18615,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8386,7 +18644,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -8406,7 +18664,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8454,7 +18712,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -8474,7 +18732,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8494,337 +18752,1115 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v36i16_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB43_5 +; GFX11-TRUE16-NEXT: .LBB43_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB43_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_3: +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36f16: ; VI: ; %bb.0: @@ -8850,7 +19886,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -8870,9 +19906,9 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -8910,7 +19946,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -8974,7 +20010,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -8994,9 +20030,9 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -9034,7 +20070,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -9065,7 +20101,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9090,7 +20126,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9119,7 +20155,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -9139,9 +20175,9 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9184,7 +20220,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -9223,345 +20259,1213 @@ end: ret <36 x half> %phi } +define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s10, s4, 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_add_u32 s12, s18, 3 +; SI-NEXT: s_addc_u32 s13, s19, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s20, 3 +; SI-NEXT: s_addc_u32 s17, s21, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s22, 3 +; SI-NEXT: s_addc_u32 s21, s23, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s44, s28, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v9i64_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v9i64_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v9i64: ; VI: ; %bb.0: @@ -9590,7 +21494,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -9647,9 +21551,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9706,7 +21610,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9776,7 +21680,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -9841,9 +21745,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -9891,7 +21795,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9920,7 +21824,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -9940,7 +21844,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9988,7 +21892,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -10008,7 +21912,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -10028,175 +21932,1067 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v36f16_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB47_5 +; GFX11-TRUE16-NEXT: .LBB47_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB47_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_3: +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v26, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v26, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v3, v3, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v5, v5, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v7, v7, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v9, v9, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v10, v10, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v12, v12, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v16, v16, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v18, v18, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36i16: ; VI: ; %bb.0: @@ -10222,7 +23018,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -10242,9 +23038,9 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10273,7 +23069,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -10337,7 +23133,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -10357,9 +23153,9 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10388,7 +23184,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -10419,7 +23215,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10430,7 +23226,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10459,7 +23255,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -10479,9 +23275,9 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10510,7 +23306,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -10549,269 +23345,1033 @@ end: ret <36 x i16> %phi } +define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_alignbit_b32 v5, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v9f64_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v18, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v19, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v19, v34, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s5 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v9f64: ; VI: ; %bb.0: @@ -10840,7 +24400,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -10897,9 +24457,9 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -10956,7 +24516,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11026,7 +24586,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -11091,9 +24651,9 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -11140,7 +24700,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11169,7 +24729,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11189,7 +24749,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11237,7 +24797,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11257,7 +24817,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11277,319 +24837,1097 @@ end: ret <9 x double> %phi } +define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v36i16_to_v9f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v9f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB51_5 +; GFX11-TRUE16-NEXT: .LBB51_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB51_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_3: +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi +} + define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36f16: ; VI: ; %bb.0: @@ -11615,7 +25953,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -11635,9 +25973,9 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11666,7 +26004,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -11730,7 +26068,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -11750,9 +26088,9 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11781,7 +26119,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -11812,7 +26150,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11823,7 +26161,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11852,7 +26190,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -11872,9 +26210,9 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11899,31 +26237,894 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + +define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: s_lshr_b32 s8, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: s_lshr_b32 s8, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: s_lshr_b32 s8, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v9f64_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v18, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v19, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v19, v34, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-TRUE16-NEXT: s_branch .LBB53_5 +; GFX11-TRUE16-NEXT: .LBB53_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s5 +; GFX11-TRUE16-NEXT: .LBB53_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-FAKE16-NEXT: s_branch .LBB53_5 +; GFX11-FAKE16-NEXT: .LBB53_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5 +; GFX11-FAKE16-NEXT: .LBB53_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11943,344 +27144,361 @@ end: } define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v9f64: ; VI: ; %bb.0: @@ -12309,7 +27527,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -12366,9 +27584,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -12425,7 +27643,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12495,7 +27713,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -12560,9 +27778,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -12610,7 +27828,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12639,7 +27857,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12659,7 +27877,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12707,7 +27925,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12727,7 +27945,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12747,420 +27965,1308 @@ end: ret <9 x double> %phi } +define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v36f16_to_v9f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v9f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB55_5 +; GFX11-TRUE16-NEXT: .LBB55_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB55_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_3: +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi +} + define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v30 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v39 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v39 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v37 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v35 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v44 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v45 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v47 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v57 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v48 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v49 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v50 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v20, v19 -; GCN-NEXT: v_or_b32_e32 v11, v23, v22 -; GCN-NEXT: v_or_b32_e32 v13, v26, v25 -; GCN-NEXT: v_or_b32_e32 v14, v29, v28 -; GCN-NEXT: v_or_b32_e32 v16, v35, v34 -; GCN-NEXT: v_or_b32_e32 v17, v38, v37 -; GCN-NEXT: v_or_b32_e32 v19, v49, v48 -; GCN-NEXT: v_or_b32_e32 v20, v52, v51 -; GCN-NEXT: v_or_b32_e32 v22, v53, v42 -; GCN-NEXT: v_or_b32_e32 v23, v54, v31 -; GCN-NEXT: v_or_b32_e32 v25, v55, v32 -; GCN-NEXT: v_or_b32_e32 v26, v40, v33 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v36f16: ; VI: ; %bb.0: @@ -13187,7 +29293,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v35, 3, v35 @@ -13225,7 +29331,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13290,7 +29396,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v17, v35, v17, s6 @@ -13347,7 +29453,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v18, v0, s4 @@ -13378,7 +29484,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] @@ -13398,7 +29504,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13428,7 +29534,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 @@ -13484,7 +29590,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 @@ -13522,323 +29628,1274 @@ end: ret <36 x half> %phi } +define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v36i16_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v8, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v7, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v6, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v5, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v19, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v18, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s42 +; GFX9-NEXT: v_mov_b32_e32 v24, s41 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s15 +; GFX9-NEXT: v_mov_b32_e32 v27, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v29, s12 +; GFX9-NEXT: v_mov_b32_e32 v30, s11 +; GFX9-NEXT: v_mov_b32_e32 v31, s10 +; GFX9-NEXT: v_mov_b32_e32 v32, s9 +; GFX9-NEXT: v_mov_b32_e32 v33, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v35, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v35, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v34, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v31, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v29, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v28, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v27, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v23, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v22, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s5 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v29, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v24, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v30 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v55 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v31 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v33 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v35 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v52, v22, v21 -; GCN-NEXT: v_or_b32_e32 v50, v24, v23 -; GCN-NEXT: v_or_b32_e32 v48, v26, v25 -; GCN-NEXT: v_or_b32_e32 v38, v28, v27 -; GCN-NEXT: v_or_b32_e32 v37, v30, v29 -; GCN-NEXT: v_or_b32_e32 v18, v18, v20 -; GCN-NEXT: v_or_b32_e32 v16, v16, v19 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_alignbit_b32 v54, v35, v21, 16 -; GCN-NEXT: v_alignbit_b32 v53, v33, v23, 16 -; GCN-NEXT: v_alignbit_b32 v51, v31, v25, 16 -; GCN-NEXT: v_alignbit_b32 v49, v11, v27, 16 -; GCN-NEXT: v_alignbit_b32 v39, v9, v29, 16 -; GCN-NEXT: v_alignbit_b32 v20, v7, v20, 16 -; GCN-NEXT: v_alignbit_b32 v19, v5, v19, 16 -; GCN-NEXT: v_alignbit_b32 v17, v3, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v1, v15, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v54 -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v53 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v51 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; GCN-NEXT: v_or_b32_e32 v30, v30, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v49 -; GCN-NEXT: v_or_b32_e32 v32, v32, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v39 -; GCN-NEXT: v_or_b32_e32 v34, v34, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v18, v18, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v16, v16, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v36, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v39, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v50, v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v49, v21, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_or_b32_e32 v38, v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v35, v25, v23 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_alignbit_b32 v55, v39, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v36, v22, 16 +; SI-NEXT: v_alignbit_b32 v53, v33, v21, 16 +; SI-NEXT: v_alignbit_b32 v52, v31, v23, 16 +; SI-NEXT: v_alignbit_b32 v51, v13, v24, 16 +; SI-NEXT: v_alignbit_b32 v19, v10, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v6, v18, 16 +; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v36i16: ; VI: ; %bb.0: @@ -13865,7 +30922,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 @@ -13903,7 +30960,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 ; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13968,7 +31025,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v17, v35, v17, s6 @@ -14026,7 +31083,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v18, v0, s4 @@ -14057,7 +31114,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] @@ -14077,7 +31134,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14107,7 +31164,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 @@ -14163,7 +31220,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 @@ -14200,3 +31257,937 @@ end: %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <36 x i16> %phi } + +define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v23, v23, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v24, v24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v27, v27, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v13, v13, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v15, v15, v33 +; SI-NEXT: v_or_b32_e32 v29, v29, v32 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_alignbit_b32 v36, v19, v36, 16 +; SI-NEXT: v_alignbit_b32 v35, v16, v35, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v34, 16 +; SI-NEXT: v_alignbit_b32 v33, v27, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v24, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v23, v31, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v30, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v36f16_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v22, 0x200 +; VI-NEXT: v_add_f16_e32 v20, s16, v22 +; VI-NEXT: v_add_f16_e32 v35, s43, v22 +; VI-NEXT: v_add_f16_e32 v21, s17, v22 +; VI-NEXT: v_add_f16_e32 v34, s42, v22 +; VI-NEXT: v_add_f16_e32 v18, s18, v22 +; VI-NEXT: v_add_f16_e32 v33, s41, v22 +; VI-NEXT: v_add_f16_e32 v19, s19, v22 +; VI-NEXT: v_add_f16_e32 v32, s40, v22 +; VI-NEXT: v_add_f16_e32 v4, s20, v22 +; VI-NEXT: v_add_f16_e32 v31, s15, v22 +; VI-NEXT: v_add_f16_e32 v5, s21, v22 +; VI-NEXT: v_add_f16_e32 v30, s14, v22 +; VI-NEXT: v_add_f16_e32 v6, s22, v22 +; VI-NEXT: v_add_f16_e32 v29, s13, v22 +; VI-NEXT: v_add_f16_e32 v7, s23, v22 +; VI-NEXT: v_add_f16_e32 v28, s12, v22 +; VI-NEXT: v_add_f16_e32 v8, s24, v22 +; VI-NEXT: v_add_f16_e32 v27, s11, v22 +; VI-NEXT: v_add_f16_e32 v9, s25, v22 +; VI-NEXT: v_add_f16_e32 v26, s10, v22 +; VI-NEXT: v_add_f16_e32 v10, s26, v22 +; VI-NEXT: v_add_f16_e32 v25, s9, v22 +; VI-NEXT: v_add_f16_e32 v11, s27, v22 +; VI-NEXT: v_add_f16_e32 v24, s8, v22 +; VI-NEXT: v_add_f16_e32 v12, s28, v22 +; VI-NEXT: v_add_f16_e32 v23, s7, v22 +; VI-NEXT: v_add_f16_e32 v13, s29, v22 +; VI-NEXT: v_add_f16_e32 v22, s6, v22 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v24, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v25, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v26, s10 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v27, s11 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v28, s12 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v29, s13 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v30, s14 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v31, s15 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v32, s40 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v34, s42 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v35, s43 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v36f16_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v7, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v6, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v5, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v4, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v19, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v18, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v21, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v20, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s42 +; GFX9-NEXT: v_mov_b32_e32 v24, s41 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s15 +; GFX9-NEXT: v_mov_b32_e32 v27, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v29, s12 +; GFX9-NEXT: v_mov_b32_e32 v30, s11 +; GFX9-NEXT: v_mov_b32_e32 v31, s10 +; GFX9-NEXT: v_mov_b32_e32 v32, s9 +; GFX9-NEXT: v_mov_b32_e32 v33, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v35, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v35, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v34, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v31, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v29, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v28, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v27, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v23, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v22, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s5 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v29, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v24, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 67e035ba7d934..64b6ca9e6117e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -1,44 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v20f32: ; VI: ; %bb.0: @@ -156,39 +156,269 @@ end: ret <20 x float> %phi } +define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v20i32_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v20i32_to_v20f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v20i32: ; VI: ; %bb.0: @@ -197,7 +427,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -219,7 +449,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -230,7 +460,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -252,7 +482,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -264,7 +494,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -276,7 +506,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -296,39 +526,259 @@ end: ret <20 x i32> %phi } +define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v20f32_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v20f32_to_v20i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v10i64: ; VI: ; %bb.0: @@ -337,7 +787,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -359,7 +809,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -370,7 +820,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -392,7 +842,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -404,7 +854,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -426,69 +876,497 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + +define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v20i32_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v20i32_to_v10i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + +define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v10i64_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i64_to_v20i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i64_to_v20i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i64_to_v20i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <20 x i32> %a, splat (i32 3) - %a2 = bitcast <20 x i32> %a1 to <10 x i64> + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <20 x i32> br label %end cmp.false: - %a3 = bitcast <20 x i32> %a to <10 x i64> + %a3 = bitcast <10 x i64> %a to <20 x i32> br label %end end: - %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i64> %phi + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi } -define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v10i64_to_v20i32: +; VI-LABEL: bitcast_v10i64_to_v20i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -509,19 +1387,41 @@ define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v10i64_to_v20i32: +; GFX9-LABEL: bitcast_v10i64_to_v20i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 @@ -542,20 +1442,37 @@ define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB7_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 ; -; GFX11-LABEL: bitcast_v10i64_to_v20i32: +; GFX11-LABEL: bitcast_v10i64_to_v20i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo @@ -581,8 +1498,6 @@ define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -602,38 +1517,38 @@ end: } define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v10f64: ; VI: ; %bb.0: @@ -642,7 +1557,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -664,7 +1579,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -675,7 +1590,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -697,7 +1612,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -709,7 +1624,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -731,7 +1646,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -751,29 +1666,259 @@ end: ret <10 x double> %phi } +define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v20i32_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v20i32_to_v10f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v20i32: ; VI: ; %bb.0: @@ -782,7 +1927,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -794,7 +1939,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -805,7 +1950,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -817,7 +1962,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -829,7 +1974,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -841,7 +1986,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -861,202 +2006,411 @@ end: ret <20 x i32> %phi } +define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v10f64_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v12, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v10f64_to_v20i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40i16: ; VI: ; %bb.0: @@ -1084,7 +2438,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -1106,9 +2460,9 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -1150,7 +2504,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -1220,7 +2574,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -1242,9 +2596,9 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -1286,7 +2640,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -1319,7 +2673,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -1341,7 +2695,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1372,7 +2726,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -1394,9 +2748,9 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -1438,7 +2792,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -1479,321 +2833,1158 @@ end: ret <40 x i16> %phi } +define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v20i32_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v20i32: ; VI: ; %bb.0: @@ -1826,7 +4017,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1889,9 +4080,9 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -1954,7 +4145,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2034,7 +4225,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -2113,9 +4304,9 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -2174,7 +4365,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2203,7 +4394,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2225,7 +4416,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2277,7 +4468,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2299,7 +4490,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2319,381 +4510,1184 @@ end: ret <20 x i32> %phi } +define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v40i16_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40f16: ; VI: ; %bb.0: @@ -2721,7 +5715,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -2743,9 +5737,9 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -2787,7 +5781,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -2857,7 +5851,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -2879,9 +5873,9 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -2923,7 +5917,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -2956,7 +5950,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -2978,7 +5972,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3009,7 +6003,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -3031,9 +6025,9 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -3075,7 +6069,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -3116,405 +6110,1371 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_lshr_b32 s56, s11, 16 +; SI-NEXT: s_lshr_b32 s57, s10, 16 +; SI-NEXT: s_lshr_b32 s58, s8, 16 +; SI-NEXT: s_lshr_b32 s59, s7, 16 +; SI-NEXT: s_lshr_b32 s60, s6, 16 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v20i32_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v20i32: ; VI: ; %bb.0: @@ -3547,7 +7507,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3610,9 +7570,9 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3675,7 +7635,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3755,7 +7715,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -3834,9 +7794,9 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -3896,7 +7856,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3925,7 +7885,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -3947,7 +7907,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3999,7 +7959,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4021,69 +7981,1173 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <40 x half> %a, splat (half 0xH0200) - %a2 = bitcast <40 x half> %a1 to <20 x i32> + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + +define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v40f16_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + +define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v20f32_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f32_to_v10i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f32_to_v10i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f32_to_v10i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <10 x i64> br label %end cmp.false: - %a3 = bitcast <40 x half> %a to <20 x i32> + %a3 = bitcast <20 x float> %a to <10 x i64> br label %end end: - %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <20 x i32> %phi + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi } -define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 ; -; VI-LABEL: bitcast_v20f32_to_v10i64: +; VI-LABEL: bitcast_v20f32_to_v10i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 @@ -4104,19 +9168,41 @@ define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB21_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 ; -; GFX9-LABEL: bitcast_v20f32_to_v10i64: +; GFX9-LABEL: bitcast_v20f32_to_v10i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 @@ -4137,20 +9223,37 @@ define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB21_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 ; -; GFX11-LABEL: bitcast_v20f32_to_v10i64: +; GFX11-LABEL: bitcast_v20f32_to_v10i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -4161,8 +9264,6 @@ define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4182,38 +9283,38 @@ end: } define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i64_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v20f32: ; VI: ; %bb.0: @@ -4222,7 +9323,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -4244,7 +9345,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4255,7 +9356,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -4277,7 +9378,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4289,7 +9390,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4316,7 +9417,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4336,39 +9437,274 @@ end: ret <20 x float> %phi } +define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v10i64_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v10i64_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v10i64_to_v20f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v10f64: ; VI: ; %bb.0: @@ -4377,7 +9713,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4399,7 +9735,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4410,7 +9746,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4432,7 +9768,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4444,7 +9780,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -4456,8 +9792,228 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + +define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v20f32_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v20f32_to_v10f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4477,28 +10033,28 @@ end: } define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v20f32: ; VI: ; %bb.0: @@ -4507,7 +10063,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4519,7 +10075,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4530,7 +10086,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4542,7 +10098,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4554,7 +10110,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4566,7 +10122,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4586,202 +10142,411 @@ end: ret <20 x float> %phi } +define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v10f64_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v12, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v10f64_to_v20f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40i16: ; VI: ; %bb.0: @@ -4809,7 +10574,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -4831,9 +10596,9 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4875,7 +10640,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -4945,7 +10710,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -4967,9 +10732,9 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -5011,7 +10776,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -5044,7 +10809,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -5056,7 +10821,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5087,7 +10852,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -5109,9 +10874,9 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -5143,7 +10908,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -5184,321 +10949,1144 @@ end: ret <40 x i16> %phi } +define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v35, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v35, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v20f32_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v25, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v20, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v23, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v7, v38, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v20f32: ; VI: ; %bb.0: @@ -5531,7 +12119,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5594,9 +12182,9 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -5659,7 +12247,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5739,7 +12327,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -5818,9 +12406,9 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -5879,7 +12467,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5908,7 +12496,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -5930,7 +12518,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,7 +12570,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -6004,9 +12592,801 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + +define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v40i16_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6025,380 +13405,391 @@ end: } define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40f16: ; VI: ; %bb.0: @@ -6426,7 +13817,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -6448,9 +13839,9 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -6492,7 +13883,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -6562,7 +13953,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -6584,9 +13975,9 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -6628,7 +14019,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -6661,7 +14052,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -6673,7 +14064,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6699,91 +14090,1038 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + +define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v20f32_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v25, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v20, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v23, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v7, v38, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6802,404 +15140,431 @@ end: } define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v20f32: ; VI: ; %bb.0: @@ -7232,7 +15597,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -7295,9 +15660,9 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7360,7 +15725,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7440,7 +15805,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -7519,9 +15884,9 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -7581,7 +15946,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7610,7 +15975,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -7632,7 +15997,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7684,7 +16049,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -7706,69 +16071,1188 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <40 x half> %a, splat (half 0xH0200) - %a2 = bitcast <40 x half> %a1 to <20 x float> + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + +define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v40f16_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + +define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v10i64_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i64_to_v10f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i64_to_v10f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i64_to_v10f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <10 x double> br label %end cmp.false: - %a3 = bitcast <40 x half> %a to <20 x float> + %a3 = bitcast <10 x i64> %a to <10 x double> br label %end end: - %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <20 x float> %phi + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi } -define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v10i64_to_v10f64: +; VI-LABEL: bitcast_v10i64_to_v10f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -7789,19 +17273,41 @@ define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v10i64_to_v10f64: +; GFX9-LABEL: bitcast_v10i64_to_v10f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -7822,20 +17328,37 @@ define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v10i64_to_v10f64: +; GFX11-LABEL: bitcast_v10i64_to_v10f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -7861,8 +17384,6 @@ define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo ; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7882,28 +17403,28 @@ end: } define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v10i64: ; VI: ; %bb.0: @@ -7912,7 +17433,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7924,7 +17445,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7935,7 +17456,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7947,7 +17468,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7959,7 +17480,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7971,7 +17492,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7991,202 +17512,411 @@ end: ret <10 x i64> %phi } +define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v10f64_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v12, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v10f64_to_v10i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v7, v7, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v27 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i64_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40i16: ; VI: ; %bb.0: @@ -8214,7 +17944,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -8236,9 +17966,9 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -8280,7 +18010,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -8350,7 +18080,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -8372,9 +18102,9 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -8416,7 +18146,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -8449,7 +18179,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8476,7 +18206,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8507,7 +18237,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -8529,9 +18259,9 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8578,7 +18308,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -8619,321 +18349,1158 @@ end: ret <40 x i16> %phi } +define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v10i64_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v10i64_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v10i64: ; VI: ; %bb.0: @@ -8966,7 +19533,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -9029,9 +19596,9 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -9094,7 +19661,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9174,7 +19741,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -9253,9 +19820,9 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -9314,7 +19881,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9343,7 +19910,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -9365,7 +19932,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9417,7 +19984,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -9439,7 +20006,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9459,381 +20026,1184 @@ end: ret <10 x i64> %phi } +define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v40i16_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i64_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40f16: ; VI: ; %bb.0: @@ -9861,7 +21231,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -9883,9 +21253,9 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -9927,7 +21297,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -9997,7 +21367,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -10019,9 +21389,9 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -10063,7 +21433,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -10096,7 +21466,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10123,7 +21493,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10154,7 +21524,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -10176,9 +21546,9 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10225,7 +21595,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -10266,405 +21636,1371 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s12, s4, 16 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: s_add_u32 s14, s18, 3 +; SI-NEXT: s_addc_u32 s15, s19, 0 +; SI-NEXT: s_lshr_b32 s16, s14, 16 +; SI-NEXT: s_lshr_b32 s17, s15, 16 +; SI-NEXT: s_add_u32 s18, s20, 3 +; SI-NEXT: s_addc_u32 s19, s21, 0 +; SI-NEXT: s_lshr_b32 s20, s18, 16 +; SI-NEXT: s_lshr_b32 s21, s19, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s58, s7, 16 +; SI-NEXT: s_lshr_b32 s59, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s60, s6, 16 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v10i64_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v10i64_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v10i64: ; VI: ; %bb.0: @@ -10697,7 +23033,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -10760,9 +23096,9 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -10825,7 +23161,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10905,7 +23241,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -10984,9 +23320,9 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -11046,7 +23382,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11075,7 +23411,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -11097,7 +23433,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11149,7 +23485,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -11171,9 +23507,930 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + +define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v40f16_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11192,191 +24449,210 @@ end: } define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v37, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v37, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v38, v49, v38 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v3, v3, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v12, v12, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v16, v16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40i16: ; VI: ; %bb.0: @@ -11404,7 +24680,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -11426,9 +24702,9 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -11460,7 +24736,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -11530,7 +24806,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -11552,9 +24828,9 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -11586,7 +24862,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -11619,7 +24895,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -11631,22 +24907,901 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + +define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v10f64_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v25, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s0 :: v_dual_mov_b32 v20, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v36, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v27, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v25, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v23, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 ; -; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16: +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 @@ -11657,91 +25812,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11760,320 +25831,317 @@ end: } define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v10f64: ; VI: ; %bb.0: @@ -12106,7 +26174,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -12169,9 +26237,9 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -12234,7 +26302,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12314,7 +26382,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -12393,9 +26461,9 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -12454,7 +26522,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12483,7 +26551,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12505,7 +26573,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12557,7 +26625,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12579,7 +26647,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12599,361 +26667,1164 @@ end: ret <10 x double> %phi } +define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v40i16_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v14, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40f16: ; VI: ; %bb.0: @@ -12981,7 +27852,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -13003,9 +27874,9 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13037,7 +27908,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -13107,7 +27978,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -13129,9 +28000,9 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13163,7 +28034,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -13196,7 +28067,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13208,7 +28079,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13239,7 +28110,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -13261,9 +28132,9 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13295,7 +28166,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -13336,405 +28207,1351 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s10, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: s_lshr_b32 s10, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: s_lshr_b32 s10, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 +; SI-NEXT: s_lshr_b32 s10, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: s_lshr_b32 s10, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[34:35], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[29:30], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v25, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s0 :: v_dual_mov_b32 v20, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v36, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v27, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v25, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v23, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v10f64: ; VI: ; %bb.0: @@ -13767,7 +29584,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -13830,9 +29647,9 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -13895,7 +29712,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13975,7 +29792,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -14054,9 +29871,9 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -14116,7 +29933,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -14145,7 +29962,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -14167,7 +29984,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14219,7 +30036,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -14241,7 +30058,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14261,513 +30078,1448 @@ end: ret <10 x double> %phi } +define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v40f16_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v39 -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v55 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v55 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v53 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v51 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v49 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v43 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v46 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v56 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v58 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v59 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v60 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v61 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v63 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v26, v25 -; GCN-NEXT: v_or_b32_e32 v15, v29, v28 -; GCN-NEXT: v_or_b32_e32 v16, v39, v38 -; GCN-NEXT: v_or_b32_e32 v17, v50, v49 -; GCN-NEXT: v_or_b32_e32 v19, v53, v52 -; GCN-NEXT: v_or_b32_e32 v20, v55, v31 -; GCN-NEXT: v_or_b32_e32 v22, v44, v32 -; GCN-NEXT: v_or_b32_e32 v23, v46, v33 -; GCN-NEXT: v_or_b32_e32 v25, v56, v34 -; GCN-NEXT: v_or_b32_e32 v26, v40, v35 -; GCN-NEXT: v_or_b32_e32 v28, v41, v36 -; GCN-NEXT: v_or_b32_e32 v29, v42, v37 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v40f16: ; VI: ; %bb.0: @@ -14796,7 +31548,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v39, 3, v39 @@ -14838,7 +31590,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -14909,7 +31661,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s6 @@ -14972,7 +31724,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s4 @@ -15005,7 +31757,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -15027,7 +31779,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15059,7 +31811,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 @@ -15121,7 +31873,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 @@ -15161,370 +31913,1448 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v40i16_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v8, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v7, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v6, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v23, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v22, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v23, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s20 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v26, s43 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s41 +; GFX9-NEXT: v_mov_b32_e32 v29, s40 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v32, s13 +; GFX9-NEXT: v_mov_b32_e32 v33, s12 +; GFX9-NEXT: v_mov_b32_e32 v34, s11 +; GFX9-NEXT: v_mov_b32_e32 v35, s10 +; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: v_mov_b32_e32 v37, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v39, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s10, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v38.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v39.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v25, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v25, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v40 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v4 -; GCN-NEXT: v_or_b32_e32 v9, v9, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_or_b32_e32 v11, v11, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; GCN-NEXT: v_or_b32_e32 v12, v12, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_or_b32_e32 v13, v13, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v31 -; GCN-NEXT: v_or_b32_e32 v53, v25, v24 -; GCN-NEXT: v_or_b32_e32 v51, v27, v26 -; GCN-NEXT: v_or_b32_e32 v49, v29, v28 -; GCN-NEXT: v_or_b32_e32 v39, v39, v30 -; GCN-NEXT: v_or_b32_e32 v37, v37, v48 -; GCN-NEXT: v_or_b32_e32 v36, v36, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: v_or_b32_e32 v17, v17, v20 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: v_or_b32_e32 v15, v15, v54 -; GCN-NEXT: v_or_b32_e32 v35, v35, v52 -; GCN-NEXT: v_or_b32_e32 v34, v34, v55 -; GCN-NEXT: v_or_b32_e32 v33, v33, v50 -; GCN-NEXT: v_alignbit_b32 v55, v33, v24, 16 -; GCN-NEXT: v_alignbit_b32 v54, v34, v26, 16 -; GCN-NEXT: v_alignbit_b32 v52, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v50, v15, v30, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v48, 16 -; GCN-NEXT: v_alignbit_b32 v38, v13, v38, 16 -; GCN-NEXT: v_alignbit_b32 v23, v12, v23, 16 -; GCN-NEXT: v_alignbit_b32 v22, v11, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v10, v20, 16 -; GCN-NEXT: v_alignbit_b32 v18, v9, v18, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v31 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v54 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v52 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v49, v49, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v35, v8 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v39, v39, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v37, v37, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v14, v6 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v36, v36, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v20 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v10, v16, v18 -; GCN-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v30 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v44 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v59 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_or_b32_e32 v50, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v54 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v49, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v53 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v36, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v34, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v32, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v35, v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v33, v33, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_or_b32_e32 v48, v30, v39 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v39, v20, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_alignbit_b32 v40, v39, v23, 16 +; SI-NEXT: v_alignbit_b32 v55, v48, v24, 16 +; SI-NEXT: v_alignbit_b32 v54, v33, v25, 16 +; SI-NEXT: v_alignbit_b32 v53, v35, v26, 16 +; SI-NEXT: v_alignbit_b32 v52, v18, v27, 16 +; SI-NEXT: v_alignbit_b32 v51, v12, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, v9, v28, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v19, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v16, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v16, v16, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v16, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v40i16: ; VI: ; %bb.0: @@ -15553,7 +33383,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v39, 0x200, v39 @@ -15595,7 +33425,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 ; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -15666,7 +33496,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s6 @@ -15730,7 +33560,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s4 @@ -15763,7 +33593,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] @@ -15785,7 +33615,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15817,7 +33647,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 @@ -15879,7 +33709,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 @@ -15918,3 +33748,1035 @@ end: %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <40 x i16> %phi } + +define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v27, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v39 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v34, v25, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v33, v25, v33 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v35, v35, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_or_b32_e32 v22, v22, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v27 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v30, v30, v38 +; SI-NEXT: v_or_b32_e32 v29, v29, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v51 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v52 +; SI-NEXT: v_alignbit_b32 v49, v19, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v20, v27, 16 +; SI-NEXT: v_alignbit_b32 v25, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v35, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v33, v50, 16 +; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 +; SI-NEXT: v_alignbit_b32 v38, v12, v37, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v51, 16 +; SI-NEXT: v_alignbit_b32 v36, v3, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v52, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v40f16_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v26, 0x200 +; VI-NEXT: v_add_f16_e32 v24, s16, v26 +; VI-NEXT: v_add_f16_e32 v39, s43, v26 +; VI-NEXT: v_add_f16_e32 v25, s17, v26 +; VI-NEXT: v_add_f16_e32 v38, s42, v26 +; VI-NEXT: v_add_f16_e32 v20, s18, v26 +; VI-NEXT: v_add_f16_e32 v37, s41, v26 +; VI-NEXT: v_add_f16_e32 v21, s19, v26 +; VI-NEXT: v_add_f16_e32 v36, s40, v26 +; VI-NEXT: v_add_f16_e32 v22, s20, v26 +; VI-NEXT: v_add_f16_e32 v35, s15, v26 +; VI-NEXT: v_add_f16_e32 v23, s21, v26 +; VI-NEXT: v_add_f16_e32 v34, s14, v26 +; VI-NEXT: v_add_f16_e32 v6, s22, v26 +; VI-NEXT: v_add_f16_e32 v33, s13, v26 +; VI-NEXT: v_add_f16_e32 v7, s23, v26 +; VI-NEXT: v_add_f16_e32 v32, s12, v26 +; VI-NEXT: v_add_f16_e32 v8, s24, v26 +; VI-NEXT: v_add_f16_e32 v31, s11, v26 +; VI-NEXT: v_add_f16_e32 v9, s25, v26 +; VI-NEXT: v_add_f16_e32 v30, s10, v26 +; VI-NEXT: v_add_f16_e32 v10, s26, v26 +; VI-NEXT: v_add_f16_e32 v29, s9, v26 +; VI-NEXT: v_add_f16_e32 v11, s27, v26 +; VI-NEXT: v_add_f16_e32 v28, s8, v26 +; VI-NEXT: v_add_f16_e32 v12, s28, v26 +; VI-NEXT: v_add_f16_e32 v27, s7, v26 +; VI-NEXT: v_add_f16_e32 v13, s29, v26 +; VI-NEXT: v_add_f16_e32 v26, s6, v26 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v28, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v29, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v30, s10 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v31, s11 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v32, s12 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v33, s13 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v34, s14 +; VI-NEXT: v_mov_b32_e32 v23, s21 +; VI-NEXT: v_mov_b32_e32 v35, s15 +; VI-NEXT: v_mov_b32_e32 v22, s20 +; VI-NEXT: v_mov_b32_e32 v36, s40 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v37, s41 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v38, s42 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v39, s43 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v24, v24, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40f16_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v7, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v6, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v23, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v22, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v21, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v20, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v23, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s20 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v26, s43 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s41 +; GFX9-NEXT: v_mov_b32_e32 v29, s40 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v32, s13 +; GFX9-NEXT: v_mov_b32_e32 v33, s12 +; GFX9-NEXT: v_mov_b32_e32 v34, s11 +; GFX9-NEXT: v_mov_b32_e32 v35, s10 +; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: v_mov_b32_e32 v37, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v39, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s10, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v38.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v39.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v25, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v25, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 8fa9b3c46ae93..abb312899114e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1,26 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define double @bitcast_i64_to_f64(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_f64: ; VI: ; %bb.0: @@ -82,20 +81,106 @@ end: ret double %phi } +define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_i64_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_i64_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_i64_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to double + br label %end + +cmp.false: + %a3 = bitcast i64 %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define i64 @bitcast_f64_to_i64(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_i64: ; VI: ; %bb.0: @@ -131,10 +216,10 @@ define i64 @bitcast_f64_to_i64(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -154,21 +239,106 @@ end: ret i64 %phi } +define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast double %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v2i32: ; VI: ; %bb.0: @@ -230,21 +400,107 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_i64_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_i64_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_i64_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_i64: ; VI: ; %bb.0: @@ -305,21 +561,107 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v2i32_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v2i32_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v2i32_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v2f32: ; VI: ; %bb.0: @@ -381,21 +723,107 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_i64_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_i64_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_i64_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_i64: ; VI: ; %bb.0: @@ -455,32 +883,120 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4i16: ; VI: ; %bb.0: @@ -542,46 +1058,142 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_i64_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_i64_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_i64_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_i64: ; VI: ; %bb.0: @@ -647,81 +1259,192 @@ end: ret i64 %phi } -define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_i64_to_v4f16: +; VI-LABEL: bitcast_v4i16_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 ; -; GFX9-LABEL: bitcast_i64_to_v4f16: +; GFX9-LABEL: bitcast_v4i16_to_i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_i64_to_v4f16: +; GFX11-LABEL: bitcast_v4i16_to_i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { +; SI-LABEL: bitcast_i64_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -750,55 +1473,156 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_i64_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_i64_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_i64_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_i64: ; VI: ; %bb.0: @@ -865,44 +1689,164 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v4f16_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4bf16: ; VI: ; %bb.0: @@ -964,51 +1908,152 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_i64_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_i64_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_i64_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_i64: ; VI: ; %bb.0: @@ -1017,7 +2062,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -1056,7 +2101,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1067,7 +2112,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -1101,7 +2146,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1113,7 +2158,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1155,7 +2200,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,7 +2212,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1202,7 +2247,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1222,47 +2267,330 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v4bf16_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB23_3: +; GFX11-TRUE16-NEXT: s_branch .LBB23_2 +; GFX11-TRUE16-NEXT: .LBB23_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB23_3: +; GFX11-FAKE16-NEXT: s_branch .LBB23_2 +; GFX11-FAKE16-NEXT: .LBB23_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_4 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB12_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: .LBB12_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_4 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v8i8: ; VI: ; %bb.0: @@ -1400,7 +2728,7 @@ define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1412,7 +2740,7 @@ define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -1434,97 +2762,319 @@ end: ret <8 x i8> %phi } -define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v8i8_to_i64: +; VI-LABEL: bitcast_i64_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_i64_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_i64_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_i64_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v8i8_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB26_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_4 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB26_4 +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1540,8 +3090,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: .LBB26_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1572,14 +3122,14 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: s_cbranch_execnz .LBB26_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_4 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1595,8 +3145,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: .LBB26_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1629,14 +3179,14 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -1665,8 +3215,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -1714,14 +3264,14 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -1750,8 +3300,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -1802,20 +3352,278 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v8i8_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v8i8_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v8i8_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v2i32: ; VI: ; %bb.0: @@ -1851,10 +3659,10 @@ define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1874,21 +3682,106 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast double %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_f64: ; VI: ; %bb.0: @@ -1949,20 +3842,106 @@ end: ret double %phi } +define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v2i32_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v2i32_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v2i32_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v2f32: ; VI: ; %bb.0: @@ -1998,10 +3977,10 @@ define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2021,21 +4000,106 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast double %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_f64: ; VI: ; %bb.0: @@ -2095,33 +4159,121 @@ end: ret double %phi } +define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_4 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_3: +; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v5, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_alignbit_b32 v1, v5, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_mov_b32_e32 v2, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4i16: ; VI: ; %bb.0: @@ -2157,10 +4309,10 @@ define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2180,46 +4332,143 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_branch .LBB37_5 +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: .LBB37_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast double %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_f64: ; VI: ; %bb.0: @@ -2285,42 +4534,153 @@ end: ret double %phi } -define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_mov_b32_e32 v1, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v4i16_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v4i16_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + +define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { +; SI-LABEL: bitcast_f64_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16: ; VI: ; %bb.0: @@ -2356,10 +4716,10 @@ define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2379,55 +4739,154 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_f64_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast double %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_f64: ; VI: ; %bb.0: @@ -2494,38 +4953,156 @@ end: ret double %phi } +define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v4f16_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v5 -; GCN-NEXT: v_mov_b32_e32 v1, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v5 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4bf16: ; VI: ; %bb.0: @@ -2561,10 +5138,10 @@ define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2584,51 +5161,151 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast double %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_f64: ; VI: ; %bb.0: @@ -2637,7 +5314,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -2676,7 +5353,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2687,7 +5364,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -2721,7 +5398,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,7 +5410,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2775,7 +5452,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2787,7 +5464,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2822,7 +5499,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2842,45 +5519,326 @@ end: ret double %phi } +define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v4bf16_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_3: +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_3: +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 24 -; GCN-NEXT: v_alignbit_b32 v2, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 24 -; GCN-NEXT: v_alignbit_b32 v2, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v4, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v8i8: ; VI: ; %bb.0: @@ -2905,7 +5863,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] @@ -2914,7 +5872,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 ; VI-NEXT: v_mov_b32_e32 v4, v9 @@ -2943,7 +5901,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: ; %bb.2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] @@ -2952,7 +5910,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, v9 @@ -2977,7 +5935,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2985,7 +5943,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h @@ -3017,7 +5975,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3027,7 +5985,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -3049,101 +6007,339 @@ end: ret <8 x i8> %phi } -define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s8, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v8i8_to_f64: +; VI-LABEL: bitcast_f64_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_3 -; VI-NEXT: ; %bb.1: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_4 -; VI-NEXT: .LBB25_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s9, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s10, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_mov_b32_e32 v9, s17 +; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v7, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s9, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s5, s17, 8 +; GFX9-NEXT: s_lshr_b32 s11, s16, 16 +; GFX9-NEXT: s_lshr_b32 s10, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f64_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f64_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v1, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast double %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v8i8_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_4 +; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB50_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr9 @@ -3155,8 +6351,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: .LBB50_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3187,14 +6383,14 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: s_cbranch_execnz .LBB50_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_4 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB50_4 +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3210,8 +6406,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: .LBB50_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3244,14 +6440,14 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -3280,8 +6476,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -3329,14 +6525,14 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -3365,8 +6561,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -3417,21 +6613,279 @@ end: ret double %phi } +define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v8i8_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v8i8_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-LABEL: bitcast_v8i8_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB51_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v2f32: ; VI: ; %bb.0: @@ -3492,21 +6946,107 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v2i32_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v2i32: ; VI: ; %bb.0: @@ -3566,32 +7106,120 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_3: +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4i16: ; VI: ; %bb.0: @@ -3652,46 +7280,142 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v2i32_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v2i32: ; VI: ; %bb.0: @@ -3757,48 +7481,159 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v4i16_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4f16: ; VI: ; %bb.0: @@ -3859,55 +7694,156 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s16, 3 +; SI-NEXT: s_add_i32 s6, s17, 3 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v2i32_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2i32: ; VI: ; %bb.0: @@ -3974,44 +7910,164 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v4f16_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4bf16: ; VI: ; %bb.0: @@ -4072,51 +8128,152 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s16, 3 +; SI-NEXT: s_add_i32 s5, s17, 3 +; SI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v2i32: ; VI: ; %bb.0: @@ -4125,7 +8282,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -4164,7 +8321,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4175,7 +8332,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -4209,7 +8366,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4221,7 +8378,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4244,74 +8401,357 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + +define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB67_3: +; GFX11-TRUE16-NEXT: s_branch .LBB67_2 +; GFX11-TRUE16-NEXT: .LBB67_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32: +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v2 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB67_3: +; GFX11-FAKE16-NEXT: s_branch .LBB67_2 +; GFX11-FAKE16-NEXT: .LBB67_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4331,46 +8771,46 @@ end: } define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_4 -; GCN-NEXT: .LBB34_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB34_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: .LBB34_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_4 +; SI-NEXT: .LBB68_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB68_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: .LBB68_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v8i8: ; VI: ; %bb.0: @@ -4507,7 +8947,7 @@ define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 @@ -4518,7 +8958,7 @@ define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -4540,76 +8980,298 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v2i32_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v2i32_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v2i32_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v2i32: ; VI: ; %bb.0: @@ -4623,14 +9285,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4646,8 +9308,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4678,14 +9340,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4701,8 +9363,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4735,14 +9397,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -4771,8 +9433,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -4820,14 +9482,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -4856,8 +9518,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -4908,32 +9570,289 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v8i8_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4i16: ; VI: ; %bb.0: @@ -4993,46 +9912,144 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB73_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v2f32: ; VI: ; %bb.0: @@ -5098,48 +10115,159 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v4i16_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_4 -; GCN-NEXT: .LBB38_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB38_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: .LBB38_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_4 +; SI-NEXT: .LBB76_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB76_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: .LBB76_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16: ; VI: ; %bb.0: @@ -5155,32 +10283,135 @@ define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v2f32_to_v4f16: +; GFX9-LABEL: bitcast_v2f32_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v2f32_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2f32_to_v4f16: +; GFX11-LABEL: bitcast_v2f32_to_v4f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5200,54 +10431,54 @@ end: } define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2f32: ; VI: ; %bb.0: @@ -5314,44 +10545,164 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v4f16_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_4 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_3: +; VI-NEXT: s_branch .LBB79_2 +; VI-NEXT: .LBB79_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_4 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_3: +; GFX9-NEXT: s_branch .LBB79_2 +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_4 -; GCN-NEXT: .LBB40_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB40_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: .LBB40_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_4 +; SI-NEXT: .LBB80_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB80_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: .LBB80_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4bf16: ; VI: ; %bb.0: @@ -5411,51 +10762,155 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v2f32: ; VI: ; %bb.0: @@ -5464,7 +10919,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -5503,7 +10958,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5514,7 +10969,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -5548,7 +11003,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5560,7 +11015,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -5602,7 +11057,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5614,7 +11069,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5649,7 +11104,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5669,47 +11124,330 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB83_3: +; GFX11-TRUE16-NEXT: s_branch .LBB83_2 +; GFX11-TRUE16-NEXT: .LBB83_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB83_3: +; GFX11-FAKE16-NEXT: s_branch .LBB83_2 +; GFX11-FAKE16-NEXT: .LBB83_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB42_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB42_4 -; GCN-NEXT: .LBB42_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB42_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: .LBB42_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB84_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB84_4 +; SI-NEXT: .LBB84_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB84_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: .LBB84_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v8i8: ; VI: ; %bb.0: @@ -5845,7 +11583,7 @@ define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5855,7 +11593,7 @@ define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -5877,76 +11615,317 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s16, 1.0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_mov_b32_e32 v9, s17 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2f32_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2f32_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_4 -; GCN-NEXT: .LBB43_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB43_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: .LBB43_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_4 +; SI-NEXT: .LBB86_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB86_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: .LBB86_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v2f32: ; VI: ; %bb.0: @@ -5960,14 +11939,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: s_cbranch_execnz .LBB86_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB86_4 +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5983,8 +11962,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 -; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB86_2 +; VI-NEXT: .LBB86_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6015,14 +11994,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: s_cbranch_execnz .LBB86_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB86_4 +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6038,8 +12017,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 -; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB86_2 +; GFX9-NEXT: .LBB86_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6072,14 +12051,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -6108,8 +12087,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -6157,14 +12136,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -6193,8 +12172,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -6245,50 +12224,309 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v8i8_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: .LBB87_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB87_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB87_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v3 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: v_mov_b32_e32 v6, v1 -; GCN-NEXT: v_mov_b32_e32 v7, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_4 -; GCN-NEXT: .LBB44_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB44_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: .LBB44_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_4 +; SI-NEXT: .LBB88_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB88_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: .LBB88_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4f16: ; VI: ; %bb.0: @@ -6354,40 +12592,148 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v4i16_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_4 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_3: +; GFX9-NEXT: s_branch .LBB89_2 +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4i16: ; VI: ; %bb.0: @@ -6454,48 +12800,164 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v4f16_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v3, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v2, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_3: +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4bf16: ; VI: ; %bb.0: @@ -6561,56 +13023,174 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_lshl_b32 s8, s19, 16 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_4 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_3: +; GFX9-NEXT: s_branch .LBB93_2 +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_4 -; GCN-NEXT: .LBB47_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB47_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: .LBB47_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_4 +; SI-NEXT: .LBB94_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4i16: ; VI: ; %bb.0: @@ -6619,7 +13199,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -6658,7 +13238,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6669,7 +13249,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -6703,7 +13283,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s6 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6715,7 +13295,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 @@ -6760,7 +13340,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6772,44 +13352,318 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s19 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB95_2 +; +; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v0, v3, v4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB95_3: +; GFX11-TRUE16-NEXT: s_branch .LBB95_2 +; GFX11-TRUE16-NEXT: .LBB95_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB95_3: +; GFX11-FAKE16-NEXT: s_branch .LBB95_2 +; GFX11-FAKE16-NEXT: .LBB95_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6829,66 +13683,66 @@ end: } define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_4 -; GCN-NEXT: .LBB48_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB48_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v11 -; GCN-NEXT: v_or_b32_e32 v4, v1, v12 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_bfe_u32 v7, v10, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: .LBB48_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v11, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v1 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_4 +; SI-NEXT: .LBB96_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB96_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v4, v1, v11 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: v_bfe_u32 v7, v10, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: .LBB96_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v8i8: ; VI: ; %bb.0: @@ -6914,7 +13768,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v2, 3 ; VI-NEXT: v_add_u16_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -6929,7 +13783,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v9 ; VI-NEXT: v_mov_b32_e32 v1, v4 @@ -7033,7 +13887,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] @@ -7044,7 +13898,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -7066,84 +13920,345 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_bfe_u32 s8, s19, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s8, s7, 24 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB97_2 +; +; VI-LABEL: bitcast_v4i16_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s9, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: s_add_i32 s8, s4, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s5, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff +; VI-NEXT: s_lshl_b32 s6, s8, 16 +; VI-NEXT: s_or_b32 s7, s4, s6 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 +; VI-NEXT: s_lshr_b32 s5, s7, 8 +; VI-NEXT: s_lshr_b32 s10, s6, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 8 +; VI-NEXT: s_bfe_u32 s9, s8, 0x80008 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4i16_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4i16_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB49_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB49_4 -; GCN-NEXT: .LBB49_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB49_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v4, v7, v1 -; GCN-NEXT: v_or_b32_e32 v1, v5, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v2, v0, v4 -; GCN-NEXT: v_or_b32_e32 v0, v3, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: .LBB49_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v11, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v5, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB98_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB98_4 +; SI-NEXT: .LBB98_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB98_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: .LBB98_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v9 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4i16: ; VI: ; %bb.0: @@ -7157,14 +14272,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: s_cbranch_execnz .LBB98_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_4 -; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB98_4 +; VI-NEXT: .LBB98_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: .LBB98_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -7180,8 +14295,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: .LBB98_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -7212,14 +14327,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: s_cbranch_execnz .LBB98_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_4 -; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB98_4 +; GFX9-NEXT: .LBB98_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -7235,8 +14350,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: .LBB98_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -7269,14 +14384,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -7305,8 +14420,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -7354,14 +14469,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -7390,8 +14505,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -7442,58 +14557,327 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s7, 16 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v8i8_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-NEXT: .LBB99_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB99_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB99_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_4 -; GCN-NEXT: .LBB50_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB50_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: .LBB50_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_4 +; SI-NEXT: .LBB100_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB100_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: .LBB100_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4bf16: ; VI: ; %bb.0: @@ -7560,62 +14944,185 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v3, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v2, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_4 -; GCN-NEXT: .LBB51_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB51_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: .LBB51_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_4 +; SI-NEXT: .LBB102_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB102_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4f16: ; VI: ; %bb.0: @@ -7624,7 +15131,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -7663,7 +15170,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7674,7 +15181,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -7708,7 +15215,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s6 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7720,7 +15227,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -7762,7 +15269,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7774,7 +15281,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -7794,24 +15301,316 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB103_2 +; +; VI-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB103_3: +; GFX11-TRUE16-NEXT: s_branch .LBB103_2 +; GFX11-TRUE16-NEXT: .LBB103_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB103_3: +; GFX11-FAKE16-NEXT: s_branch .LBB103_2 +; GFX11-FAKE16-NEXT: .LBB103_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7831,69 +15630,69 @@ end: } define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_4 -; GCN-NEXT: .LBB52_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB52_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v9, v0 -; GCN-NEXT: v_or_b32_e32 v4, v8, v1 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: .LBB52_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v4, v2, v3 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_4 +; SI-NEXT: .LBB104_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB104_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v4, v8, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: .LBB104_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v8i8: ; VI: ; %bb.0: @@ -7916,7 +15715,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7931,7 +15730,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 ; VI-NEXT: v_mov_b32_e32 v4, v9 @@ -7960,7 +15759,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: ; %bb.2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -7971,7 +15770,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, v9 @@ -8036,7 +15835,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -8047,7 +15846,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -8069,76 +15868,337 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v4, v8, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v4f16_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s8, s17, 24 +; VI-NEXT: s_lshr_b32 s10, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: v_add_f16_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v10, v8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v9, v0, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v4, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v6, s10 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4f16_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: .LBB105_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4f16_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB53_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB53_4 -; GCN-NEXT: .LBB53_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB53_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v8 -; GCN-NEXT: v_or_b32_e32 v3, v3, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: .LBB53_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v5, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v11, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB106_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB106_4 +; SI-NEXT: .LBB106_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB106_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: .LBB106_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4f16: ; VI: ; %bb.0: @@ -8152,14 +16212,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: s_cbranch_execnz .LBB106_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_4 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB106_4 +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: .LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8175,8 +16235,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 -; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB106_2 +; VI-NEXT: .LBB106_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8207,14 +16267,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: s_cbranch_execnz .LBB106_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_4 -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB106_4 +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8230,8 +16290,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 -; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB106_2 +; GFX9-NEXT: .LBB106_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8264,14 +16324,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -8300,8 +16360,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -8349,14 +16409,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -8385,8 +16445,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -8437,68 +16497,324 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s7, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB107_2 +; +; VI-LABEL: bitcast_v8i8_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-NEXT: .LBB107_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB107_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB107_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_4 -; GCN-NEXT: .LBB54_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB54_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: v_alignbit_b32 v0, v0, v10, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: .LBB54_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_4 +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB108_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: .LBB108_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v8i8: ; VI: ; %bb.0: @@ -8523,7 +16839,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -8569,7 +16885,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 ; VI-NEXT: v_mov_b32_e32 v4, v9 @@ -8598,7 +16914,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: ; %bb.2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -8640,7 +16956,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v11 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, v9 @@ -8660,7 +16976,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 @@ -8669,9 +16985,9 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v9.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l @@ -8715,7 +17031,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h @@ -8748,7 +17064,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8792,7 +17108,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -8814,80 +17130,482 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s8, s17, 24 +; VI-NEXT: s_lshr_b32 s5, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16 +; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_mov_b32_e32 v4, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s9, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s8, s16, 16 +; GFX9-NEXT: s_lshr_b32 s5, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB109_4 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v9, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; GFX9-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB109_3: +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB109_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, v3, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v6, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB109_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v1 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v7, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v11, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v7, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v2, v8, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4bf16: ; VI: ; %bb.0: @@ -8901,14 +17619,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: s_cbranch_execnz .LBB110_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB110_4 +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8924,8 +17642,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 -; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB110_2 +; VI-NEXT: .LBB110_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8956,14 +17674,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: s_cbranch_execnz .LBB110_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB110_4 +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8979,8 +17697,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 -; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB110_2 +; GFX9-NEXT: .LBB110_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -9013,14 +17731,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -9049,8 +17767,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -9098,14 +17816,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -9134,8 +17852,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -9185,3 +17903,269 @@ end: %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <4 x bfloat> %phi } + +define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB111_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB111_3 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s18, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: .LBB111_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB111_2 +; +; VI-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s17, 8 +; VI-NEXT: s_and_b32 s8, s16, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-NEXT: .LBB111_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB111_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB111_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 08590a3af70f5..051c60e59acc6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -1,46 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v22f32: ; VI: ; %bb.0: @@ -164,41 +164,286 @@ end: ret <22 x float> %phi } +define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v22i32_to_v22f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v22f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v22i32_to_v22f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <22 x float> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v22i32: ; VI: ; %bb.0: @@ -207,7 +452,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -231,7 +476,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -242,7 +487,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -266,7 +511,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -278,7 +523,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -291,7 +536,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -311,41 +556,275 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v22f32_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v22f32_to_v22i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v11i64: ; VI: ; %bb.0: @@ -354,7 +833,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -378,7 +857,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -389,7 +868,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -413,7 +892,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -425,7 +904,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -449,7 +928,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -469,109 +948,354 @@ end: ret <11 x i64> %phi } -define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v11i64_to_v22i32: +; VI-LABEL: bitcast_v22i32_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB5_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 ; -; GFX9-LABEL: bitcast_v11i64_to_v22i32: +; GFX9-LABEL: bitcast_v22i32_to_v11i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v22i32_to_v11i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i64_to_v22i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i64_to_v22i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -583,7 +1307,7 @@ define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -613,7 +1337,7 @@ define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -633,41 +1357,292 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v11i64_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v11i64_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v11i64_to_v22i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x i64> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v11f64: ; VI: ; %bb.0: @@ -676,7 +1651,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -700,7 +1675,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -711,7 +1686,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -735,7 +1710,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -747,7 +1722,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -771,7 +1746,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -791,52 +1766,297 @@ end: ret <11 x double> %phi } -define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v11f64_to_v22i32: +; VI-LABEL: bitcast_v22i32_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v22i32_to_v11f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + +define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v11f64_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f64_to_v22i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -847,7 +2067,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -860,7 +2080,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -872,7 +2092,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -885,7 +2105,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -905,220 +2125,442 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v11f64_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v11f64_to_v22i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v1, v1, v49 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v5, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v10, v10, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v12, v12, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v13, v13, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v22, v22, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v44i16: ; VI: ; %bb.0: @@ -1148,7 +2590,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -1172,9 +2614,9 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -1220,7 +2662,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -1296,7 +2738,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -1320,9 +2762,9 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -1368,7 +2810,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -1403,7 +2845,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -1427,7 +2869,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1460,7 +2902,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -1484,9 +2926,9 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -1532,7 +2974,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -1575,372 +3017,1283 @@ end: ret <44 x i16> %phi } +define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v22i32_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v22i32: ; VI: ; %bb.0: @@ -1977,7 +4330,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2046,9 +4399,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -2117,7 +4470,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2207,7 +4560,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -2300,9 +4653,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -2373,7 +4726,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2402,7 +4755,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2426,7 +4779,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2482,7 +4835,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2506,7 +4859,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2526,466 +4879,1339 @@ end: ret <22 x i32> %phi } -define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v22i32_to_v44f16: +; VI-LABEL: bitcast_v44i16_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + +define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v22i32_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v22i32_to_v44f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 @@ -2993,7 +6219,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -3017,9 +6243,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -3065,7 +6291,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -3141,7 +6367,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -3165,9 +6391,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -3213,7 +6439,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -3248,7 +6474,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -3272,7 +6498,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3305,7 +6531,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -3329,9 +6555,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -3377,7 +6603,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -3420,465 +6646,1527 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s22, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s44, s24, 16 +; SI-NEXT: s_lshr_b32 s45, s25, 16 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: s_lshr_b32 s47, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s28, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s13, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s10, 16 +; SI-NEXT: s_lshr_b32 s62, s8, 16 +; SI-NEXT: s_lshr_b32 s63, s7, 16 +; SI-NEXT: s_lshr_b32 s72, s6, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v22i32: ; VI: ; %bb.0: @@ -3915,7 +8203,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3984,9 +8272,9 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4055,7 +8343,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4145,7 +8433,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -4238,9 +8526,9 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -4312,7 +8600,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4341,7 +8629,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4365,7 +8653,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4421,7 +8709,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4445,7 +8733,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4465,41 +8753,1050 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v44f16_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v11i64: ; VI: ; %bb.0: @@ -4508,7 +9805,7 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4532,7 +9829,7 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4543,7 +9840,7 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4567,11 +9864,392 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f32_to_v11i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v22f32_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v22f32_to_v11i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i64_to_v22f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i64_to_v22f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v22f32_to_v11i64: +; GFX11-LABEL: bitcast_v11i64_to_v22f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -4579,84 +10257,148 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <22 x float> %a1 to <11 x i64> + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <22 x float> br label %end cmp.false: - %a3 = bitcast <22 x float> %a to <11 x i64> + %a3 = bitcast <11 x i64> %a to <22 x float> br label %end end: - %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <11 x i64> %phi + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi } -define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v11i64_to_v22f32: +; VI-LABEL: bitcast_v11i64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -4679,19 +10421,43 @@ define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 ; -; GFX9-LABEL: bitcast_v11i64_to_v22f32: +; GFX9-LABEL: bitcast_v11i64_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 @@ -4714,20 +10480,38 @@ define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB23_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 ; -; GFX11-LABEL: bitcast_v11i64_to_v22f32: +; GFX11-LABEL: bitcast_v11i64_to_v22f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo @@ -4756,8 +10540,6 @@ define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4777,40 +10559,40 @@ end: } define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v11f64: ; VI: ; %bb.0: @@ -4819,7 +10601,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4843,7 +10625,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4854,7 +10636,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4878,7 +10660,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4890,7 +10672,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -4903,60 +10685,455 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + +define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v22f32_to_v11f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v22f32_to_v11f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + +define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v11f64_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f64_to_v22f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f64_to_v22f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f64_to_v22f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <22 x float> %a1 to <11 x double> + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <22 x float> br label %end cmp.false: - %a3 = bitcast <22 x float> %a to <11 x double> + %a3 = bitcast <11 x double> %a to <22 x float> br label %end end: - %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <11 x double> %phi + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi } -define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 ; -; VI-LABEL: bitcast_v11f64_to_v22f32: +; VI-LABEL: bitcast_v11f64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4968,19 +11145,43 @@ define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 ; -; GFX9-LABEL: bitcast_v11f64_to_v22f32: +; GFX9-LABEL: bitcast_v11f64_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4992,20 +11193,38 @@ define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 ; -; GFX11-LABEL: bitcast_v11f64_to_v22f32: +; GFX11-LABEL: bitcast_v11f64_to_v22f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -5017,8 +11236,6 @@ define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5038,219 +11255,240 @@ end: } define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v1, v1, v49 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v5, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v10, v10, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v12, v12, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v13, v13, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v22, v22, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44i16: ; VI: ; %bb.0: @@ -5280,7 +11518,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -5304,9 +11542,9 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -5352,7 +11590,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -5428,7 +11666,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -5452,9 +11690,9 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -5500,7 +11738,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -5535,7 +11773,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -5548,7 +11786,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5581,7 +11819,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -5605,9 +11843,9 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -5642,7 +11880,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -5685,372 +11923,1265 @@ end: ret <44 x i16> %phi } +define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v22, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v9, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 +; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 +; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v23, v49 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v22f32_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v26, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v23, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s27 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v22f32: ; VI: ; %bb.0: @@ -6087,7 +13218,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6156,9 +13287,9 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -6227,7 +13358,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6317,7 +13448,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -6410,9 +13541,9 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -6483,7 +13614,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6512,7 +13643,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -6536,7 +13667,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6592,7 +13723,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -6616,9 +13747,879 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <22 x float> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + +define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v44i16_to_v22f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v22f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6637,443 +14638,446 @@ end: } define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44f16: ; VI: ; %bb.0: @@ -7103,7 +15107,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -7127,9 +15131,9 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -7175,7 +15179,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -7251,7 +15255,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -7275,9 +15279,9 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -7323,7 +15327,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -7358,7 +15362,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -7371,15 +15375,1157 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + +define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v26, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v23, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s27 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 @@ -7397,100 +16543,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7509,464 +16562,499 @@ end: } define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v22f32: ; VI: ; %bb.0: @@ -8003,7 +17091,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8072,9 +17160,9 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -8143,7 +17231,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8233,7 +17321,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -8326,9 +17414,9 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -8400,7 +17488,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8429,7 +17517,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -8453,7 +17541,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8509,7 +17597,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -8533,71 +17621,1291 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <44 x half> %a, splat (half 0xH0200) - %a2 = bitcast <44 x half> %a1 to <22 x float> + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <22 x float> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + +define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v44f16_to_v22f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v22f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <22 x float> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + +define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i64_to_v11f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i64_to_v11f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11i64_to_v11f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <11 x double> br label %end cmp.false: - %a3 = bitcast <44 x half> %a to <22 x float> + %a3 = bitcast <11 x i64> %a to <11 x double> br label %end end: - %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <22 x float> %phi + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi } -define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <11 x double> @bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v11i64_to_v11f64: +; VI-LABEL: bitcast_v11i64_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -8620,19 +18928,43 @@ define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v11i64_to_v11f64: +; GFX9-LABEL: bitcast_v11i64_to_v11f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -8655,20 +18987,38 @@ define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v11i64_to_v11f64: +; GFX11-LABEL: bitcast_v11i64_to_v11f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -8697,8 +19047,6 @@ define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8718,29 +19066,29 @@ end: } define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f64_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v11i64: ; VI: ; %bb.0: @@ -8749,7 +19097,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -8762,7 +19110,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8773,7 +19121,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -8786,7 +19134,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8798,7 +19146,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -8811,7 +19159,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8831,220 +19179,442 @@ end: ret <11 x i64> %phi } +define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v11f64_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v11f64_to_v11i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v1, v1, v49 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v5, v5, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v10, v10, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v11, v11, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i64_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44i16: ; VI: ; %bb.0: @@ -9074,7 +19644,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -9098,9 +19668,9 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc @@ -9146,7 +19716,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -9222,7 +19792,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -9246,9 +19816,9 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc @@ -9294,7 +19864,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -9329,7 +19899,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9359,7 +19929,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9392,7 +19962,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -9416,9 +19986,9 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9470,7 +20040,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -9513,372 +20083,1283 @@ end: ret <44 x i16> %phi } +define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v11i64_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v11i64_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s6, s6, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x i64> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v11i64: ; VI: ; %bb.0: @@ -9915,7 +21396,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -9984,9 +21465,9 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -10055,7 +21536,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10145,7 +21626,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -10238,9 +21719,9 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -10311,7 +21792,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10340,7 +21821,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -10364,7 +21845,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10420,7 +21901,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -10444,7 +21925,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -10464,444 +21945,1317 @@ end: ret <11 x i64> %phi } +define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v44i16_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i64_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44f16: ; VI: ; %bb.0: @@ -10931,7 +23285,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -10955,9 +23309,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc @@ -11003,7 +23357,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -11079,7 +23433,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -11103,9 +23457,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc @@ -11151,7 +23505,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -11186,7 +23540,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11216,7 +23570,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11249,7 +23603,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -11273,9 +23627,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11327,7 +23681,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -11370,465 +23724,1527 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s14, s4, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s17, s19, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s40, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s42, s22, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s44, s24, 16 +; SI-NEXT: s_lshr_b32 s45, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: s_lshr_b32 s47, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s56, s28, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s58, s12, 16 +; SI-NEXT: s_lshr_b32 s59, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s60, s10, 16 +; SI-NEXT: s_lshr_b32 s61, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s62, s7, 16 +; SI-NEXT: s_lshr_b32 s63, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s72, s6, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s14 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v11i64_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v11i64_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s6, s6, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x i64> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v11i64: ; VI: ; %bb.0: @@ -11865,7 +25281,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -11934,9 +25350,9 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -12005,7 +25421,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12095,7 +25511,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -12188,9 +25604,9 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -12262,7 +25678,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12291,7 +25707,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12315,7 +25731,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12371,7 +25787,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12395,7 +25811,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12415,209 +25831,1239 @@ end: ret <11 x i64> %phi } +define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v44f16_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v10, v10, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v11, v11, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v12, v12, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v14, v14, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v16, v16, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v18, v18, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f64_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44i16: ; VI: ; %bb.0: @@ -12647,7 +27093,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -12671,9 +27117,9 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12708,7 +27154,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -12784,7 +27230,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -12808,9 +27254,9 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12845,7 +27291,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -12880,7 +27326,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12893,7 +27339,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12926,7 +27372,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -12950,9 +27396,9 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12980,39 +27426,904 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + +define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v22, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v49 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v11f64_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13031,371 +28342,364 @@ end: } define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v11f64: ; VI: ; %bb.0: @@ -13432,7 +28736,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -13501,9 +28805,9 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -13572,7 +28876,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13662,7 +28966,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -13755,9 +29059,9 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -13828,7 +29132,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13857,7 +29161,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -13881,7 +29185,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13937,7 +29241,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -13961,7 +29265,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -13981,422 +29285,1295 @@ end: ret <11 x double> %phi } +define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v44i16_to_v11f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v39 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v37 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v33 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f64_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44f16: ; VI: ; %bb.0: @@ -14426,7 +30603,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -14450,9 +30627,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14487,7 +30664,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -14563,7 +30740,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -14587,9 +30764,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14624,7 +30801,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -14659,7 +30836,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14672,7 +30849,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14705,7 +30882,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -14729,9 +30906,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14766,7 +30943,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -14809,465 +30986,1516 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v8 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s12, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: s_lshr_b32 s12, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: s_lshr_b32 s12, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: s_lshr_b32 s12, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: s_lshr_b32 s12, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: s_lshr_b32 s12, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: s_lshr_b32 s12, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: s_lshr_b32 s12, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: s_lshr_b32 s12, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 +; SI-NEXT: s_lshr_b32 s12, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s12 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[29:30], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v11f64_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v11f64: ; VI: ; %bb.0: @@ -15304,7 +32532,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -15373,9 +32601,9 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -15444,7 +32672,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15534,7 +32762,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -15627,9 +32855,9 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -15701,7 +32929,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15730,7 +32958,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15754,7 +32982,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15810,7 +33038,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15834,7 +33062,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15854,607 +33082,1630 @@ end: ret <11 x double> %phi } +define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v44f16_to_v11f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; kill: killed $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v51 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v53 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v55 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v53 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v60 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v62 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v63 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v31 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v32 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v37 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v38 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v58 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v57 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v29, v28 -; GCN-NEXT: v_or_b32_e32 v26, v32, v31 -; GCN-NEXT: v_or_b32_e32 v27, v35, v34 -; GCN-NEXT: v_or_b32_e32 v28, v38, v37 -; GCN-NEXT: v_or_b32_e32 v29, v51, v39 -; GCN-NEXT: v_or_b32_e32 v31, v53, v48 -; GCN-NEXT: v_or_b32_e32 v32, v55, v49 -; GCN-NEXT: v_or_b32_e32 v34, v41, v40 -; GCN-NEXT: v_or_b32_e32 v35, v43, v42 -; GCN-NEXT: v_or_b32_e32 v37, v45, v44 -; GCN-NEXT: v_or_b32_e32 v38, v47, v46 -; GCN-NEXT: v_or_b32_e32 v39, v57, v56 -; GCN-NEXT: v_or_b32_e32 v48, v59, v58 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v44f16: ; VI: ; %bb.0: @@ -16485,7 +34736,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v22, 3, v22 @@ -16531,7 +34782,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: v_add_u16_e32 v23, 3, v23 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -16608,7 +34859,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v51, v21, s6 @@ -16677,7 +34928,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v22, v0, s4 @@ -16712,7 +34963,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -16736,7 +34987,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16770,7 +35021,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 @@ -16838,7 +35089,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 @@ -16880,404 +35131,1630 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v44i16_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v8, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v23, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v22, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v26, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s23 +; GFX9-NEXT: v_mov_b32_e32 v22, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s43 +; GFX9-NEXT: v_mov_b32_e32 v31, s42 +; GFX9-NEXT: v_mov_b32_e32 v32, s41 +; GFX9-NEXT: v_mov_b32_e32 v33, s40 +; GFX9-NEXT: v_mov_b32_e32 v34, s15 +; GFX9-NEXT: v_mov_b32_e32 v35, s14 +; GFX9-NEXT: v_mov_b32_e32 v36, s13 +; GFX9-NEXT: v_mov_b32_e32 v37, s12 +; GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_mov_b32_e32 v39, s10 +; GFX9-NEXT: v_mov_b32_e32 v48, s9 +; GFX9-NEXT: v_mov_b32_e32 v49, s8 +; GFX9-NEXT: v_mov_b32_e32 v50, s7 +; GFX9-NEXT: v_mov_b32_e32 v51, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v22, v37, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v36, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v35, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v31, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v30, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s12, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s9, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v48.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v49.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v51.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s7 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v29, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s11 :: v_dual_mov_b32 v37, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s9 :: v_dual_mov_b32 v39, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v30 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v42 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v6 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v9 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v10 -; GCN-NEXT: v_or_b32_e32 v8, v8, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_or_b32_e32 v12, v12, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v54, v28, v27 -; GCN-NEXT: v_or_b32_e32 v52, v30, v29 -; GCN-NEXT: v_or_b32_e32 v50, v50, v53 -; GCN-NEXT: v_or_b32_e32 v48, v48, v51 -; GCN-NEXT: v_or_b32_e32 v38, v38, v49 -; GCN-NEXT: v_or_b32_e32 v37, v37, v39 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_or_b32_e32 v17, v17, v21 -; GCN-NEXT: v_alignbit_b32 v40, v35, v27, 16 -; GCN-NEXT: v_alignbit_b32 v55, v33, v29, 16 -; GCN-NEXT: v_alignbit_b32 v53, v31, v53, 16 -; GCN-NEXT: v_alignbit_b32 v51, v15, v51, 16 -; GCN-NEXT: v_alignbit_b32 v49, v12, v49, 16 -; GCN-NEXT: v_alignbit_b32 v39, v8, v39, 16 -; GCN-NEXT: v_alignbit_b32 v25, v3, v25, 16 -; GCN-NEXT: v_alignbit_b32 v23, v11, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v7, v26, 16 -; GCN-NEXT: v_alignbit_b32 v24, v4, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v1, v21, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v40 -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v36 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v55 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v33, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GCN-NEXT: v_or_b32_e32 v36, v36, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v48, v48, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v38, v38, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v12, v12, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v37, v37, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v8, v8, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v17, v17, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v46 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v31, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_or_b32_e32 v34, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v37, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v48, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v51, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 +; SI-NEXT: v_or_b32_e32 v50, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v39, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v36, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v33, v33, v30 +; SI-NEXT: v_or_b32_e32 v21, v21, v52 +; SI-NEXT: v_or_b32_e32 v17, v17, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_alignbit_b32 v41, v48, v26, 16 +; SI-NEXT: v_alignbit_b32 v40, v37, v27, 16 +; SI-NEXT: v_alignbit_b32 v55, v34, v28, 16 +; SI-NEXT: v_alignbit_b32 v54, v31, v29, 16 +; SI-NEXT: v_alignbit_b32 v53, v19, v30, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v52, 16 +; SI-NEXT: v_alignbit_b32 v25, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 +; SI-NEXT: v_alignbit_b32 v22, v1, v22, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v23, 16 +; SI-NEXT: v_alignbit_b32 v18, v2, v18, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v44i16: ; VI: ; %bb.0: @@ -17308,7 +36785,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 @@ -17354,7 +36831,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 ; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -17431,7 +36908,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v51, v21, s6 @@ -17501,7 +36978,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v22, v0, s4 @@ -17536,7 +37013,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -17560,7 +37037,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17594,7 +37071,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 @@ -17662,7 +37139,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 @@ -17703,3 +37180,1129 @@ end: %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <44 x i16> %phi } + +define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v13, v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v31, v31, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v34, v34, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v37, v37, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v18, v18, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v27, v27, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v29 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_or_b32_e32 v39, v39, v51 +; SI-NEXT: v_or_b32_e32 v36, v36, v50 +; SI-NEXT: v_or_b32_e32 v33, v33, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_alignbit_b32 v52, v24, v52, 16 +; SI-NEXT: v_alignbit_b32 v30, v21, v30, 16 +; SI-NEXT: v_alignbit_b32 v29, v18, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v37, v28, 16 +; SI-NEXT: v_alignbit_b32 v51, v34, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v31, v50, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v49, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v48, 16 +; SI-NEXT: v_alignbit_b32 v17, v6, v17, 16 +; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 +; SI-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v27, v27, v52 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v44f16_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v30, 0x200 +; VI-NEXT: v_add_f16_e32 v24, s16, v30 +; VI-NEXT: v_add_f16_e32 v51, s43, v30 +; VI-NEXT: v_add_f16_e32 v25, s17, v30 +; VI-NEXT: v_add_f16_e32 v50, s42, v30 +; VI-NEXT: v_add_f16_e32 v26, s18, v30 +; VI-NEXT: v_add_f16_e32 v49, s41, v30 +; VI-NEXT: v_add_f16_e32 v27, s19, v30 +; VI-NEXT: v_add_f16_e32 v48, s40, v30 +; VI-NEXT: v_add_f16_e32 v28, s20, v30 +; VI-NEXT: v_add_f16_e32 v39, s15, v30 +; VI-NEXT: v_add_f16_e32 v29, s21, v30 +; VI-NEXT: v_add_f16_e32 v38, s14, v30 +; VI-NEXT: v_add_f16_e32 v22, s22, v30 +; VI-NEXT: v_add_f16_e32 v37, s13, v30 +; VI-NEXT: v_add_f16_e32 v23, s23, v30 +; VI-NEXT: v_add_f16_e32 v36, s12, v30 +; VI-NEXT: v_add_f16_e32 v8, s24, v30 +; VI-NEXT: v_add_f16_e32 v35, s11, v30 +; VI-NEXT: v_add_f16_e32 v9, s25, v30 +; VI-NEXT: v_add_f16_e32 v34, s10, v30 +; VI-NEXT: v_add_f16_e32 v10, s26, v30 +; VI-NEXT: v_add_f16_e32 v33, s9, v30 +; VI-NEXT: v_add_f16_e32 v11, s27, v30 +; VI-NEXT: v_add_f16_e32 v32, s8, v30 +; VI-NEXT: v_add_f16_e32 v12, s28, v30 +; VI-NEXT: v_add_f16_e32 v31, s7, v30 +; VI-NEXT: v_add_f16_e32 v13, s29, v30 +; VI-NEXT: v_add_f16_e32 v30, s6, v30 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v30, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v31, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v32, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v34, s10 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v35, s11 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v36, s12 +; VI-NEXT: v_mov_b32_e32 v23, s23 +; VI-NEXT: v_mov_b32_e32 v37, s13 +; VI-NEXT: v_mov_b32_e32 v22, s22 +; VI-NEXT: v_mov_b32_e32 v38, s14 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v39, s15 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v48, s40 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v49, s41 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v50, s42 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v51, s43 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v44f16_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v23, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v22, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v27, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s23 +; GFX9-NEXT: v_mov_b32_e32 v22, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s43 +; GFX9-NEXT: v_mov_b32_e32 v31, s42 +; GFX9-NEXT: v_mov_b32_e32 v32, s41 +; GFX9-NEXT: v_mov_b32_e32 v33, s40 +; GFX9-NEXT: v_mov_b32_e32 v34, s15 +; GFX9-NEXT: v_mov_b32_e32 v35, s14 +; GFX9-NEXT: v_mov_b32_e32 v36, s13 +; GFX9-NEXT: v_mov_b32_e32 v37, s12 +; GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_mov_b32_e32 v39, s10 +; GFX9-NEXT: v_mov_b32_e32 v48, s9 +; GFX9-NEXT: v_mov_b32_e32 v49, s8 +; GFX9-NEXT: v_mov_b32_e32 v50, s7 +; GFX9-NEXT: v_mov_b32_e32 v51, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v22, v37, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v36, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v35, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v31, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v30, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s12, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s9, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v48.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v49.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v51.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s7 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v29, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s11 :: v_dual_mov_b32 v37, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s9 :: v_dual_mov_b32 v39, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index b1a194f8a3a7d..dfd5c09f77b1d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -1,48 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v24f32: ; VI: ; %bb.0: @@ -172,43 +172,300 @@ end: ret <24 x float> %phi } +define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v24i32_to_v24f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v24f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v24i32_to_v24f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v24i32: ; VI: ; %bb.0: @@ -217,7 +474,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -243,7 +500,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -254,7 +511,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -280,7 +537,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -292,7 +549,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -306,7 +563,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -326,43 +583,288 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v24f32_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v24f32_to_v24i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v12i64: ; VI: ; %bb.0: @@ -371,7 +873,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -397,7 +899,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -408,7 +910,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -434,7 +936,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -446,7 +948,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -472,7 +974,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -492,93 +994,350 @@ end: ret <12 x i64> %phi } -define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v12i64_to_v24i32: +; VI-LABEL: bitcast_v24i32_to_v12i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB5_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 ; -; GFX9-LABEL: bitcast_v12i64_to_v24i32: +; GFX9-LABEL: bitcast_v24i32_to_v12i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v24i32_to_v12i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v12i64_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i64_to_v24i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i64_to_v24i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -600,7 +1359,7 @@ define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -612,7 +1371,7 @@ define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -644,7 +1403,7 @@ define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -664,43 +1423,306 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v12i64_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v12i64_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v12i64_to_v24i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i64> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v12f64: ; VI: ; %bb.0: @@ -709,7 +1731,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -735,7 +1757,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -746,7 +1768,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -772,7 +1794,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -784,7 +1806,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -810,7 +1832,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -830,54 +1852,311 @@ end: ret <12 x double> %phi } -define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v12f64_to_v24i32: +; VI-LABEL: bitcast_v24i32_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v24i32_to_v12f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v12f64_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f64_to_v24i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -888,7 +2167,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -902,7 +2181,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -914,7 +2193,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -928,7 +2207,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -948,269 +2227,474 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v12f64_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v12f64_to_v24i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v4, v4, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v5, v5, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v30 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v35 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v29 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v48i16: ; VI: ; %bb.0: @@ -1242,7 +2726,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -1268,9 +2752,9 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -1320,7 +2804,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -1402,7 +2886,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -1428,9 +2912,9 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -1480,7 +2964,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -1517,7 +3001,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -1543,7 +3027,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1578,7 +3062,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -1604,9 +3088,9 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -1656,7 +3140,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -1701,419 +3185,1413 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v24i32_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v24i32: ; VI: ; %bb.0: @@ -2154,7 +4632,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2229,9 +4707,9 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -2306,7 +4784,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2405,7 +4883,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -2513,9 +4991,9 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -2597,7 +5075,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2626,7 +5104,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2652,7 +5130,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2712,7 +5190,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2738,7 +5216,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2758,543 +5236,1540 @@ end: ret <24 x i32> %phi } -define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v24i32_to_v48f16: +; VI-LABEL: bitcast_v48i16_to_v24i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v49, v9 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v35, v7 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + +define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v24i32_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v24i32_to_v48f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -3315,7 +6790,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -3341,9 +6816,9 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -3393,7 +6868,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -3475,7 +6950,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -3501,9 +6976,9 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -3553,7 +7028,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -3590,7 +7065,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -3616,7 +7091,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3651,7 +7126,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -3677,9 +7152,9 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -3729,7 +7204,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -3774,538 +7249,1700 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s28, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s15, 16 +; SI-NEXT: s_lshr_b32 s61, s14, 16 +; SI-NEXT: s_lshr_b32 s62, s13, 16 +; SI-NEXT: s_lshr_b32 s63, s12, 16 +; SI-NEXT: s_lshr_b32 s72, s11, 16 +; SI-NEXT: s_lshr_b32 s73, s10, 16 +; SI-NEXT: s_lshr_b32 s74, s8, 16 +; SI-NEXT: s_lshr_b32 s75, s7, 16 +; SI-NEXT: s_lshr_b32 s76, s6, 16 +; SI-NEXT: s_lshr_b32 s77, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v24i32_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v24i32: ; VI: ; %bb.0: @@ -4346,7 +8983,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -4421,9 +9058,9 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4498,7 +9135,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4597,7 +9234,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -4705,9 +9342,9 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -4790,7 +9427,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4819,7 +9456,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4845,7 +9482,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4905,7 +9542,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4931,7 +9568,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4951,43 +9588,1169 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v48f16_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v12i64: ; VI: ; %bb.0: @@ -4996,7 +10759,7 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5022,7 +10785,7 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5033,7 +10796,7 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5059,11 +10822,410 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f32_to_v12i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v24f32_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v24f32_to_v12i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v12i64_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i64_to_v24f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i64_to_v24f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v24f32_to_v12i64: +; GFX11-LABEL: bitcast_v12i64_to_v24f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -5071,87 +11233,154 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <24 x float> %a1 to <12 x i64> + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <24 x float> br label %end cmp.false: - %a3 = bitcast <24 x float> %a to <12 x i64> + %a3 = bitcast <12 x i64> %a to <24 x float> br label %end end: - %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <12 x i64> %phi + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi } -define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v12i64_to_v24f32: +; VI-LABEL: bitcast_v12i64_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -5176,19 +11405,44 @@ define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 ; -; GFX9-LABEL: bitcast_v12i64_to_v24f32: +; GFX9-LABEL: bitcast_v12i64_to_v24f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 @@ -5213,20 +11467,39 @@ define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB23_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 ; -; GFX11-LABEL: bitcast_v12i64_to_v24f32: +; GFX11-LABEL: bitcast_v12i64_to_v24f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo @@ -5257,8 +11530,6 @@ define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5278,42 +11549,42 @@ end: } define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v12f64: ; VI: ; %bb.0: @@ -5322,7 +11593,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5348,7 +11619,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5359,7 +11630,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5385,7 +11656,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5397,7 +11668,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -5411,61 +11682,473 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v24f32_to_v12f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v24f32_to_v12f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v12f64_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f64_to_v24f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f64_to_v24f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f64_to_v24f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <24 x float> %a1 to <12 x double> + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <24 x float> br label %end cmp.false: - %a3 = bitcast <24 x float> %a to <12 x double> + %a3 = bitcast <12 x double> %a to <24 x float> br label %end end: - %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <12 x double> %phi + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi } -define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 ; -; VI-LABEL: bitcast_v12f64_to_v24f32: +; VI-LABEL: bitcast_v12f64_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -5478,19 +12161,44 @@ define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 ; -; GFX9-LABEL: bitcast_v12f64_to_v24f32: +; GFX9-LABEL: bitcast_v12f64_to_v24f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -5503,20 +12211,39 @@ define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 ; -; GFX11-LABEL: bitcast_v12f64_to_v24f32: +; GFX11-LABEL: bitcast_v12f64_to_v24f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -5529,8 +12256,6 @@ define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5550,268 +12275,264 @@ end: } define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v4, v4, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v5, v5, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v30 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v35 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v29 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48i16: ; VI: ; %bb.0: @@ -5843,7 +12564,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -5869,9 +12590,9 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5921,7 +12642,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -6003,7 +12724,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -6029,9 +12750,9 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -6081,7 +12802,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -6118,7 +12839,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -6132,7 +12853,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6167,7 +12888,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -6193,9 +12914,9 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -6233,7 +12954,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -6278,419 +12999,1388 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v26, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v26, v26, v53 +; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v24f32_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v20, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v28, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v20, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v16, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v24f32: ; VI: ; %bb.0: @@ -6731,7 +14421,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6806,9 +14496,9 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -6883,7 +14573,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6982,7 +14672,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -7090,9 +14780,9 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -7174,7 +14864,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7203,7 +14893,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -7229,7 +14919,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7289,7 +14979,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -7315,9 +15005,1028 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + +define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v49, v9 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v35, v7 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v24f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7336,531 +16045,509 @@ end: } define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48f16: ; VI: ; %bb.0: @@ -7892,7 +16579,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -7918,9 +16605,9 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -7970,7 +16657,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -8052,7 +16739,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -8078,9 +16765,9 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -8130,7 +16817,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -8167,7 +16854,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -8181,7 +16868,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8216,7 +16903,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -8242,9 +16929,9 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -8258,58 +16945,1201 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + +define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v24f32_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v20, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v28, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v20, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v16, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8328,537 +18158,576 @@ end: } define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v24f32: ; VI: ; %bb.0: @@ -8899,7 +18768,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8974,9 +18843,9 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9051,7 +18920,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9150,7 +19019,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -9258,9 +19127,9 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -9343,7 +19212,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9372,7 +19241,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -9398,7 +19267,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9458,7 +19327,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -9484,73 +19353,1420 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <48 x half> %a, splat (half 0xH0200) - %a2 = bitcast <48 x half> %a1 to <24 x float> + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + +define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v48f16_to_v24f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v24f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + +define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v12i64_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i64_to_v12f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i64_to_v12f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i64_to_v12f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <12 x double> br label %end cmp.false: - %a3 = bitcast <48 x half> %a to <24 x float> + %a3 = bitcast <12 x i64> %a to <12 x double> br label %end end: - %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <24 x float> %phi + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi } -define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x double> @bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v12i64_to_v12f64: +; VI-LABEL: bitcast_v12i64_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -9575,19 +20791,44 @@ define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v12i64_to_v12f64: +; GFX9-LABEL: bitcast_v12i64_to_v12f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -9612,20 +20853,39 @@ define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v12i64_to_v12f64: +; GFX11-LABEL: bitcast_v12i64_to_v12f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9656,8 +20916,6 @@ define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo ; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9677,30 +20935,30 @@ end: } define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f64_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v12i64: ; VI: ; %bb.0: @@ -9709,7 +20967,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -9723,7 +20981,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9734,7 +20992,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -9748,7 +21006,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9760,7 +21018,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -9774,7 +21032,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9794,269 +21052,474 @@ end: ret <12 x i64> %phi } +define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v12f64_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v12f64_to_v12i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v3, v3, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v4, v4, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v5, v5, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v29 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v35 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i64_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48i16: ; VI: ; %bb.0: @@ -10088,7 +21551,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -10114,9 +21577,9 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc @@ -10166,7 +21629,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -10248,7 +21711,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -10274,9 +21737,9 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc @@ -10326,7 +21789,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -10363,7 +21826,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10395,7 +21858,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10430,7 +21893,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -10456,9 +21919,9 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10514,7 +21977,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -10559,419 +22022,1413 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v12i64_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v12i64_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i64> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v12i64: ; VI: ; %bb.0: @@ -11012,7 +23469,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -11087,9 +23544,9 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -11164,7 +23621,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11263,7 +23720,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -11371,9 +23828,9 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -11455,7 +23912,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11484,7 +23941,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11510,7 +23967,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11570,7 +24027,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11596,9 +24053,1028 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v49, v9 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v35, v7 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11617,531 +25093,509 @@ end: } define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i64_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48f16: ; VI: ; %bb.0: @@ -12173,7 +25627,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -12199,9 +25653,9 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc @@ -12251,7 +25705,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -12333,7 +25787,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -12359,9 +25813,9 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc @@ -12411,7 +25865,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -12448,7 +25902,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12480,7 +25934,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12515,7 +25969,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -12541,9 +25995,9 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12599,7 +26053,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -12644,538 +26098,1700 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s58, s28, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s62, s12, 16 +; SI-NEXT: s_lshr_b32 s63, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s72, s10, 16 +; SI-NEXT: s_lshr_b32 s73, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s74, s7, 16 +; SI-NEXT: s_lshr_b32 s75, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s76, s6, 16 +; SI-NEXT: s_lshr_b32 s77, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v12i64_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v12i64_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i64> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v12i64: ; VI: ; %bb.0: @@ -13216,7 +27832,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -13291,9 +27907,9 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -13368,7 +27984,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13467,7 +28083,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -13575,9 +28191,9 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -13660,7 +28276,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13689,7 +28305,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -13715,7 +28331,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13775,7 +28391,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -13801,7 +28417,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -13821,257 +28437,1379 @@ end: ret <12 x i64> %phi } +define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v48f16_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v50, v43, v50 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v55, v44, v55 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v48, v45, v48 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v54, v46, v54 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v39, v47, v39 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v52, v56, v52 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v30 -; GCN-NEXT: v_or_b32_e32 v14, v14, v36 -; GCN-NEXT: v_or_b32_e32 v15, v15, v29 -; GCN-NEXT: v_or_b32_e32 v16, v16, v35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v34 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v31 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f64_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48i16: ; VI: ; %bb.0: @@ -14103,7 +29841,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -14129,9 +29867,9 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14169,7 +29907,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -14251,7 +29989,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -14277,9 +30015,9 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14317,7 +30055,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -14354,7 +30092,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14368,7 +30106,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14403,7 +30141,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -14429,9 +30167,9 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14469,7 +30207,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -14514,419 +30252,1354 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v23, v23, v53 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v12f64_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v22, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v21, s24 +; GFX9-NEXT: v_mov_b32_e32 v22, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v12f64: ; VI: ; %bb.0: @@ -14967,7 +31640,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -15042,9 +31715,9 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -15119,7 +31792,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15218,7 +31891,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -15326,9 +31999,9 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -15410,7 +32083,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15439,7 +32112,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -15465,7 +32138,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15525,7 +32198,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -15551,7 +32224,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15571,508 +32244,1505 @@ end: ret <12 x double> %phi } +define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v49, v9 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v35, v7 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v14, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v15, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v18, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v19, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_or_b32_sdwa v20, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v39 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v37 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v7, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v5 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v7 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v4 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f64_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48f16: ; VI: ; %bb.0: @@ -16104,7 +33774,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -16130,9 +33800,9 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16170,7 +33840,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -16252,7 +33922,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -16278,9 +33948,9 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16318,7 +33988,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -16355,7 +34025,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16369,7 +34039,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16404,7 +34074,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -16430,9 +34100,9 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16470,7 +34140,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -16515,538 +34185,1688 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_and_b64 s[14:15], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s14, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: s_lshr_b32 s14, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: s_lshr_b32 s14, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: s_lshr_b32 s14, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: s_lshr_b32 s14, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 +; SI-NEXT: s_lshr_b32 s14, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: s_lshr_b32 s14, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: s_lshr_b32 s14, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: s_lshr_b32 s14, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: s_lshr_b32 s14, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s14 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s14 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s14 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s14 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[50:51], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[29:30], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v12f64_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v22, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v21, s24 +; GFX9-NEXT: v_mov_b32_e32 v22, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v12f64: ; VI: ; %bb.0: @@ -17087,7 +35907,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -17162,9 +35982,9 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -17239,7 +36059,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -17338,7 +36158,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -17446,9 +36266,9 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -17531,7 +36351,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -17560,7 +36380,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -17586,7 +36406,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17646,7 +36466,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -17672,9 +36492,1135 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v48f16_to_v12f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17693,697 +37639,713 @@ end: } define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v36 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v37 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v38 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v39 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v48 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v51 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v52 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v53 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v33, v38, v33 -; GCN-NEXT: v_or_b32_e32 v32, v39, v32 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v53, v52 -; GCN-NEXT: v_or_b32_e32 v49, v55, v54 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v48f16: ; VI: ; %bb.0: @@ -18416,7 +38378,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v55, 3, v55 @@ -18466,7 +38428,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: v_add_u16_e32 v25, 3, v25 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_sdwa v4, v4, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18549,7 +38511,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s6 @@ -18624,7 +38586,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s4 @@ -18661,7 +38623,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] @@ -18687,7 +38649,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18723,7 +38685,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 @@ -18797,7 +38759,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 @@ -18841,450 +38803,1845 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v48i16_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v26, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v28, s24 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s18 +; GFX9-NEXT: v_mov_b32_e32 v31, s17 +; GFX9-NEXT: v_mov_b32_e32 v30, s16 +; GFX9-NEXT: v_mov_b32_e32 v34, s43 +; GFX9-NEXT: v_mov_b32_e32 v35, s42 +; GFX9-NEXT: v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s40 +; GFX9-NEXT: v_mov_b32_e32 v38, s15 +; GFX9-NEXT: v_mov_b32_e32 v39, s14 +; GFX9-NEXT: v_mov_b32_e32 v48, s13 +; GFX9-NEXT: v_mov_b32_e32 v49, s12 +; GFX9-NEXT: v_mov_b32_e32 v50, s11 +; GFX9-NEXT: v_mov_b32_e32 v51, s10 +; GFX9-NEXT: v_mov_b32_e32 v52, s9 +; GFX9-NEXT: v_mov_b32_e32 v53, s8 +; GFX9-NEXT: v_mov_b32_e32 v54, s7 +; GFX9-NEXT: v_mov_b32_e32 v55, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v54, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v53, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v52, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v37, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v35, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v34, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v7, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s13, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s10, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v55.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s9 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v49, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v30, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v48, 16, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s13 :: v_dual_mov_b32 v39, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s11 :: v_dual_mov_b32 v49, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s9 :: v_dual_mov_b32 v51, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v53, v31 -; GCN-NEXT: v_or_b32_e32 v51, v51, v32 -; GCN-NEXT: v_or_b32_e32 v49, v49, v54 -; GCN-NEXT: v_or_b32_e32 v39, v39, v52 -; GCN-NEXT: v_or_b32_e32 v37, v37, v50 -; GCN-NEXT: v_or_b32_e32 v36, v36, v48 -; GCN-NEXT: v_or_b32_e32 v35, v35, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v29 -; GCN-NEXT: v_or_b32_e32 v24, v24, v27 -; GCN-NEXT: v_or_b32_e32 v22, v22, v25 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v55 -; GCN-NEXT: v_or_b32_e32 v13, v13, v40 -; GCN-NEXT: v_or_b32_e32 v15, v15, v42 -; GCN-NEXT: v_or_b32_e32 v17, v17, v43 -; GCN-NEXT: v_or_b32_e32 v20, v20, v44 -; GCN-NEXT: v_or_b32_e32 v19, v19, v46 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v57 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_or_b32_e32 v2, v2, v60 -; GCN-NEXT: v_alignbit_b32 v40, v2, v31, 16 -; GCN-NEXT: v_alignbit_b32 v55, v3, v32, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v54, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v52, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v50, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v48, 16 -; GCN-NEXT: v_alignbit_b32 v38, v19, v38, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v30, 16 -; GCN-NEXT: v_alignbit_b32 v29, v17, v29, 16 -; GCN-NEXT: v_alignbit_b32 v27, v15, v27, 16 -; GCN-NEXT: v_alignbit_b32 v25, v13, v25, 16 -; GCN-NEXT: v_alignbit_b32 v23, v12, v23, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v53, v53, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v51, v51, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v49, v49, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v4, v4, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v39, v39, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v9, v14, v9 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v37, v37, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v10, v16, v10 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v36, v36, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v35, v35, v38 -; GCN-NEXT: v_or_b32_e32 v6, v19, v6 -; GCN-NEXT: v_or_b32_e32 v19, v28, v30 -; GCN-NEXT: v_or_b32_e32 v5, v20, v5 -; GCN-NEXT: v_or_b32_e32 v20, v26, v29 -; GCN-NEXT: v_or_b32_e32 v17, v17, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v27 -; GCN-NEXT: v_or_b32_e32 v15, v15, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v45 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v62 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v34 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v53 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v52, v21 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v58, v2 +; SI-NEXT: v_mov_b32_e32 v60, v50 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v58 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_or_b32_e32 v60, v30, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_or_b32_e32 v58, v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v52, v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v25 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_or_b32_e32 v57, v33, v23 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v56, v24, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v34, v24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: v_or_b32_e32 v59, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v36 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v38, v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v42 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v37, v35, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v51, v33, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v50, v24, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v45 +; SI-NEXT: v_or_b32_e32 v28, v28, v25 +; SI-NEXT: v_or_b32_e32 v27, v27, v46 +; SI-NEXT: v_alignbit_b32 v44, v50, v31, 16 +; SI-NEXT: v_alignbit_b32 v43, v51, v32, 16 +; SI-NEXT: v_alignbit_b32 v42, v37, v29, 16 +; SI-NEXT: v_alignbit_b32 v41, v38, v30, 16 +; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v19, v45, 16 +; SI-NEXT: v_alignbit_b32 v54, v20, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v14, v25, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v46, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v59, v47, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v54 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v48i16: ; VI: ; %bb.0: @@ -19317,7 +40674,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 @@ -19367,7 +40724,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 ; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_sdwa v4, v4, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19450,7 +40807,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s6 @@ -19526,7 +40883,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s4 @@ -19563,7 +40920,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] @@ -19589,7 +40946,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19625,7 +40982,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 @@ -19699,7 +41056,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 @@ -19742,3 +41099,1251 @@ end: %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <48 x i16> %phi } + +define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v35, v35, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v34, v34, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v38, v38, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_or_b32_e32 v23, v23, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v53 +; SI-NEXT: v_or_b32_e32 v22, v22, v50 +; SI-NEXT: v_or_b32_e32 v25, v25, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v41 +; SI-NEXT: v_or_b32_e32 v16, v16, v28 +; SI-NEXT: v_or_b32_e32 v48, v48, v54 +; SI-NEXT: v_or_b32_e32 v39, v39, v42 +; SI-NEXT: v_or_b32_e32 v32, v32, v52 +; SI-NEXT: v_or_b32_e32 v31, v31, v51 +; SI-NEXT: v_or_b32_e32 v15, v15, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_alignbit_b32 v40, v22, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v28, v49, v28, 16 +; SI-NEXT: v_alignbit_b32 v55, v38, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v34, v42, 16 +; SI-NEXT: v_alignbit_b32 v53, v35, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v5, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v44, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_or_b32_e32 v12, v12, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v48f16_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v34, 0x200 +; VI-NEXT: v_add_f16_e32 v30, s16, v34 +; VI-NEXT: v_add_f16_e32 v55, s43, v34 +; VI-NEXT: v_add_f16_e32 v31, s17, v34 +; VI-NEXT: v_add_f16_e32 v54, s42, v34 +; VI-NEXT: v_add_f16_e32 v32, s18, v34 +; VI-NEXT: v_add_f16_e32 v53, s41, v34 +; VI-NEXT: v_add_f16_e32 v33, s19, v34 +; VI-NEXT: v_add_f16_e32 v52, s40, v34 +; VI-NEXT: v_add_f16_e32 v24, s20, v34 +; VI-NEXT: v_add_f16_e32 v51, s15, v34 +; VI-NEXT: v_add_f16_e32 v25, s21, v34 +; VI-NEXT: v_add_f16_e32 v50, s14, v34 +; VI-NEXT: v_add_f16_e32 v26, s22, v34 +; VI-NEXT: v_add_f16_e32 v49, s13, v34 +; VI-NEXT: v_add_f16_e32 v27, s23, v34 +; VI-NEXT: v_add_f16_e32 v48, s12, v34 +; VI-NEXT: v_add_f16_e32 v28, s24, v34 +; VI-NEXT: v_add_f16_e32 v39, s11, v34 +; VI-NEXT: v_add_f16_e32 v29, s25, v34 +; VI-NEXT: v_add_f16_e32 v38, s10, v34 +; VI-NEXT: v_add_f16_e32 v10, s26, v34 +; VI-NEXT: v_add_f16_e32 v37, s9, v34 +; VI-NEXT: v_add_f16_e32 v11, s27, v34 +; VI-NEXT: v_add_f16_e32 v36, s8, v34 +; VI-NEXT: v_add_f16_e32 v12, s28, v34 +; VI-NEXT: v_add_f16_e32 v35, s7, v34 +; VI-NEXT: v_add_f16_e32 v13, s29, v34 +; VI-NEXT: v_add_f16_e32 v34, s6, v34 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v34, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v35, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v36, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v37, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v38, s10 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v39, s11 +; VI-NEXT: v_mov_b32_e32 v28, s24 +; VI-NEXT: v_mov_b32_e32 v48, s12 +; VI-NEXT: v_mov_b32_e32 v27, s23 +; VI-NEXT: v_mov_b32_e32 v49, s13 +; VI-NEXT: v_mov_b32_e32 v26, s22 +; VI-NEXT: v_mov_b32_e32 v50, s14 +; VI-NEXT: v_mov_b32_e32 v25, s21 +; VI-NEXT: v_mov_b32_e32 v51, s15 +; VI-NEXT: v_mov_b32_e32 v24, s20 +; VI-NEXT: v_mov_b32_e32 v52, s40 +; VI-NEXT: v_mov_b32_e32 v33, s19 +; VI-NEXT: v_mov_b32_e32 v53, s41 +; VI-NEXT: v_mov_b32_e32 v32, s18 +; VI-NEXT: v_mov_b32_e32 v54, s42 +; VI-NEXT: v_mov_b32_e32 v31, s17 +; VI-NEXT: v_mov_b32_e32 v55, s43 +; VI-NEXT: v_mov_b32_e32 v30, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v30, v30, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v10, v10, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v48f16_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v27, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v28, s24 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s18 +; GFX9-NEXT: v_mov_b32_e32 v31, s17 +; GFX9-NEXT: v_mov_b32_e32 v30, s16 +; GFX9-NEXT: v_mov_b32_e32 v34, s43 +; GFX9-NEXT: v_mov_b32_e32 v35, s42 +; GFX9-NEXT: v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s40 +; GFX9-NEXT: v_mov_b32_e32 v38, s15 +; GFX9-NEXT: v_mov_b32_e32 v39, s14 +; GFX9-NEXT: v_mov_b32_e32 v48, s13 +; GFX9-NEXT: v_mov_b32_e32 v49, s12 +; GFX9-NEXT: v_mov_b32_e32 v50, s11 +; GFX9-NEXT: v_mov_b32_e32 v51, s10 +; GFX9-NEXT: v_mov_b32_e32 v52, s9 +; GFX9-NEXT: v_mov_b32_e32 v53, s8 +; GFX9-NEXT: v_mov_b32_e32 v54, s7 +; GFX9-NEXT: v_mov_b32_e32 v55, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v54, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v53, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v52, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v37, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v35, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v34, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v7, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s13, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s10, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v55.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s9 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v49, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v30, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v48, 16, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s13 :: v_dual_mov_b32 v39, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s11 :: v_dual_mov_b32 v49, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s9 :: v_dual_mov_b32 v51, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 75baa36ca3d11..eac4794012a9f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -1,50 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v26f32: ; VI: ; %bb.0: @@ -180,45 +180,317 @@ end: ret <26 x float> %phi } +define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v26i32_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v26i32_to_v26f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v26i32: ; VI: ; %bb.0: @@ -227,7 +499,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -255,7 +527,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -266,7 +538,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -294,7 +566,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -306,7 +578,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -321,7 +593,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -341,45 +613,304 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v26f32_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v26f32_to_v26i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v13i64: ; VI: ; %bb.0: @@ -388,7 +919,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -416,7 +947,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -427,7 +958,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -455,7 +986,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -467,7 +998,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -495,7 +1026,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -515,82 +1046,354 @@ end: ret <13 x i64> %phi } -define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v13i64_to_v26i32: +; VI-LABEL: bitcast_v26i32_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v26i32_to_v13i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + +define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v13i64_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v26i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -601,7 +1404,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -629,7 +1432,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -641,7 +1444,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -676,7 +1479,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -696,45 +1499,324 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v13i64_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v13i64_to_v26i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v13f64: ; VI: ; %bb.0: @@ -743,7 +1825,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -771,7 +1853,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -782,7 +1864,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -810,7 +1892,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -822,7 +1904,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -850,7 +1932,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -870,56 +1952,328 @@ end: ret <13 x double> %phi } -define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v13f64_to_v26i32: +; VI-LABEL: bitcast_v26i32_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v26i32_to_v13f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v13f64_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13f64_to_v26i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -930,7 +2284,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -945,7 +2299,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -957,7 +2311,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -972,7 +2326,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -992,287 +2346,522 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v13f64_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v13f64_to_v26i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v54, v47, v54 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v52, v57, v52 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v10, v10, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v37 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v31 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v52i16: ; VI: ; %bb.0: @@ -1310,7 +2899,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -1338,9 +2927,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -1394,7 +2983,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -1491,7 +3080,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -1519,9 +3108,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -1575,7 +3164,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -1619,7 +3208,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -1647,7 +3236,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1684,7 +3273,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -1712,9 +3301,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -1768,7 +3357,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -1815,470 +3404,1534 @@ end: ret <52 x i16> %phi } +define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v26i32_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v26i32: ; VI: ; %bb.0: @@ -2323,7 +4976,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2404,9 +5057,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -2487,7 +5140,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2594,7 +5247,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -2718,9 +5371,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -2812,7 +5465,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2841,7 +5494,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2869,7 +5522,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2933,7 +5586,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2961,7 +5614,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2981,593 +5634,1697 @@ end: ret <26 x i32> %phi } -define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v26i32_to_v52f16: +; VI-LABEL: bitcast_v52i16_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v39, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v34, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v51, v1 +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + +define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v26i32_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v26i32_to_v52f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 @@ -3603,7 +7360,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -3631,9 +7388,9 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -3687,7 +7444,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -3784,7 +7541,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -3812,9 +7569,9 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -3868,7 +7625,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -3912,7 +7669,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -3940,7 +7697,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3977,7 +7734,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -4005,9 +7762,9 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -4061,7 +7818,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -4108,633 +7865,1862 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s24, 16 +; SI-NEXT: s_lshr_b32 s57, s25, 16 +; SI-NEXT: s_lshr_b32 s58, s26, 16 +; SI-NEXT: s_lshr_b32 s59, s27, 16 +; SI-NEXT: s_lshr_b32 s60, s28, 16 +; SI-NEXT: s_lshr_b32 s61, s29, 16 +; SI-NEXT: s_lshr_b32 s62, s41, 16 +; SI-NEXT: s_lshr_b32 s63, s40, 16 +; SI-NEXT: s_lshr_b32 s72, s15, 16 +; SI-NEXT: s_lshr_b32 s73, s14, 16 +; SI-NEXT: s_lshr_b32 s74, s13, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 16 +; SI-NEXT: s_lshr_b32 s76, s11, 16 +; SI-NEXT: s_lshr_b32 s77, s10, 16 +; SI-NEXT: s_lshr_b32 s78, s8, 16 +; SI-NEXT: s_lshr_b32 s79, s7, 16 +; SI-NEXT: s_lshr_b32 s88, s6, 16 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v26i32_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v26i32: ; VI: ; %bb.0: @@ -4779,7 +9765,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -4860,9 +9846,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4943,7 +9929,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5050,7 +10036,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -5174,9 +10160,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -5269,7 +10255,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5298,7 +10284,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5326,7 +10312,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5390,7 +10376,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5418,7 +10404,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5438,45 +10424,1265 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v52f16_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v13i64: ; VI: ; %bb.0: @@ -5485,7 +11691,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5513,7 +11719,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5524,7 +11730,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5552,7 +11758,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5564,7 +11770,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -5579,7 +11785,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5599,67 +11805,326 @@ end: ret <13 x i64> %phi } -define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 ; -; VI-LABEL: bitcast_v13i64_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v26f32_to_v13i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + +define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v13i64_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -5674,7 +12139,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5685,7 +12150,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -5713,7 +12178,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5725,7 +12190,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -5760,7 +12225,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5780,45 +12245,324 @@ end: ret <26 x float> %phi } +define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v13i64_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v13i64_to_v26f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v13f64: ; VI: ; %bb.0: @@ -5827,7 +12571,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5855,7 +12599,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5866,7 +12610,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5894,7 +12638,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5906,7 +12650,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -5921,7 +12665,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5941,56 +12685,315 @@ end: ret <13 x double> %phi } -define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v13f64_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v26f32_to_v13f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v13f64_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13f64_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6001,7 +13004,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -6016,7 +13019,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6028,7 +13031,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -6043,7 +13046,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6063,287 +13066,522 @@ end: ret <26 x float> %phi } +define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v13f64_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v13f64_to_v26f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v54, v47, v54 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v52, v57, v52 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v10, v10, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v37 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v31 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v52i16: ; VI: ; %bb.0: @@ -6381,7 +13619,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -6409,9 +13647,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -6465,7 +13703,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -6562,7 +13800,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -6590,9 +13828,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -6646,7 +13884,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -6690,7 +13928,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -6705,7 +13943,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6742,7 +13980,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -6770,9 +14008,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -6813,7 +14051,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -6860,523 +14098,1587 @@ end: ret <52 x i16> %phi } -define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v27, s17 +; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v21, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 +; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 +; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v28, v28, v40 +; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_branch .LBB29_2 ; -; VI-LABEL: bitcast_v52i16_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v52i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v25 -; VI-NEXT: v_mov_b32_e32 v33, v24 -; VI-NEXT: v_mov_b32_e32 v34, v23 -; VI-NEXT: v_mov_b32_e32 v35, v22 -; VI-NEXT: v_mov_b32_e32 v36, v21 -; VI-NEXT: v_mov_b32_e32 v37, v20 -; VI-NEXT: v_mov_b32_e32 v38, v19 -; VI-NEXT: v_mov_b32_e32 v39, v18 -; VI-NEXT: v_mov_b32_e32 v48, v17 -; VI-NEXT: v_mov_b32_e32 v49, v16 -; VI-NEXT: v_mov_b32_e32 v50, v15 -; VI-NEXT: v_mov_b32_e32 v51, v14 -; VI-NEXT: v_mov_b32_e32 v52, v13 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v43, v6 -; VI-NEXT: v_mov_b32_e32 v44, v5 -; VI-NEXT: v_mov_b32_e32 v45, v4 -; VI-NEXT: v_mov_b32_e32 v46, v3 -; VI-NEXT: v_mov_b32_e32 v47, v2 -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB29_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v25, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v21, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s23 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v43, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + +define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v52i16_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v52i16_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v25 +; VI-NEXT: v_mov_b32_e32 v33, v24 +; VI-NEXT: v_mov_b32_e32 v34, v23 +; VI-NEXT: v_mov_b32_e32 v35, v22 +; VI-NEXT: v_mov_b32_e32 v36, v21 +; VI-NEXT: v_mov_b32_e32 v37, v20 +; VI-NEXT: v_mov_b32_e32 v38, v19 +; VI-NEXT: v_mov_b32_e32 v39, v18 +; VI-NEXT: v_mov_b32_e32 v48, v17 +; VI-NEXT: v_mov_b32_e32 v49, v16 +; VI-NEXT: v_mov_b32_e32 v50, v15 +; VI-NEXT: v_mov_b32_e32 v51, v14 +; VI-NEXT: v_mov_b32_e32 v52, v13 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v43, v6 +; VI-NEXT: v_mov_b32_e32 v44, v5 +; VI-NEXT: v_mov_b32_e32 v45, v4 +; VI-NEXT: v_mov_b32_e32 v46, v3 +; VI-NEXT: v_mov_b32_e32 v47, v2 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v25, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v6, v25, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v7, v25, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v8, v25, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -7449,9 +15751,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -7532,7 +15834,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7639,7 +15941,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -7763,9 +16065,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -7857,7 +16159,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7886,7 +16188,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -7914,7 +16216,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7978,7 +16280,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -8006,7 +16308,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8026,606 +16328,1710 @@ end: ret <26 x float> %phi } -define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB31_2 ; -; VI-LABEL: bitcast_v26f32_to_v52f16: +; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v39, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v34, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v51, v1 +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + +define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v26f32_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v26f32_to_v52f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 @@ -8648,7 +18054,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -8676,9 +18082,9 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -8732,7 +18138,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -8829,7 +18235,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -8857,9 +18263,9 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -8913,7 +18319,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -8957,7 +18363,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -8972,7 +18378,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9009,7 +18415,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -9037,9 +18443,9 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -9080,7 +18486,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -9127,738 +18533,677 @@ end: ret <52 x half> %phi } -define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v30, v41, v30 +; SI-NEXT: buffer_store_dword v30, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 +; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v11 +; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 ; -; VI-LABEL: bitcast_v52f16_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v25 -; VI-NEXT: v_mov_b32_e32 v33, v24 -; VI-NEXT: v_mov_b32_e32 v34, v23 -; VI-NEXT: v_mov_b32_e32 v35, v22 -; VI-NEXT: v_mov_b32_e32 v36, v21 -; VI-NEXT: v_mov_b32_e32 v37, v20 -; VI-NEXT: v_mov_b32_e32 v38, v19 -; VI-NEXT: v_mov_b32_e32 v39, v18 -; VI-NEXT: v_mov_b32_e32 v48, v17 -; VI-NEXT: v_mov_b32_e32 v49, v16 -; VI-NEXT: v_mov_b32_e32 v50, v15 -; VI-NEXT: v_mov_b32_e32 v51, v14 -; VI-NEXT: v_mov_b32_e32 v52, v13 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v43, v6 -; VI-NEXT: v_mov_b32_e32 v44, v5 -; VI-NEXT: v_mov_b32_e32 v45, v4 -; VI-NEXT: v_mov_b32_e32 v46, v3 -; VI-NEXT: v_mov_b32_e32 v47, v2 -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB33_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v25, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v25, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v25, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v25, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v25, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v25, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v25, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v25, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v25, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v25, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v25, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v45, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v42, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v41, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v40, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v55, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v54, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v53, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v52, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v51, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v48, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v35, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v34, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v33, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v21, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 @@ -9873,20 +19218,1337 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v25, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v57 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s23 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v43, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + +define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v52f16_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v52f16_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v25 +; VI-NEXT: v_mov_b32_e32 v33, v24 +; VI-NEXT: v_mov_b32_e32 v34, v23 +; VI-NEXT: v_mov_b32_e32 v35, v22 +; VI-NEXT: v_mov_b32_e32 v36, v21 +; VI-NEXT: v_mov_b32_e32 v37, v20 +; VI-NEXT: v_mov_b32_e32 v38, v19 +; VI-NEXT: v_mov_b32_e32 v39, v18 +; VI-NEXT: v_mov_b32_e32 v48, v17 +; VI-NEXT: v_mov_b32_e32 v49, v16 +; VI-NEXT: v_mov_b32_e32 v50, v15 +; VI-NEXT: v_mov_b32_e32 v51, v14 +; VI-NEXT: v_mov_b32_e32 v52, v13 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v43, v6 +; VI-NEXT: v_mov_b32_e32 v44, v5 +; VI-NEXT: v_mov_b32_e32 v45, v4 +; VI-NEXT: v_mov_b32_e32 v46, v3 +; VI-NEXT: v_mov_b32_e32 v47, v2 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v25, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v25, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v25, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v25, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v25, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v25, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v25, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v25, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v25, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v25, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v25, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v45, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v42, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v41, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v40, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v54, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v53, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v52, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v51, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v48, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v35, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v34, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_add_f16_sdwa v1, v56, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v56 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -9962,7 +20624,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10069,7 +20731,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -10193,9 +20855,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -10288,7 +20950,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10317,7 +20979,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -10345,7 +21007,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10409,7 +21071,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -10437,9 +21099,1229 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + +define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v52f16_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10458,44 +22340,44 @@ end: } define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13i64_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v13f64: ; VI: ; %bb.0: @@ -10504,7 +22386,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -10532,7 +22414,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10543,7 +22425,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -10571,7 +22453,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10583,7 +22465,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10618,62 +22500,516 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v13i64_to_v13f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v13i64_to_v13f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v13f64_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13f64_to_v13i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v13f64_to_v13i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v13f64_to_v13i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <13 x i64> %a, splat (i64 3) - %a2 = bitcast <13 x i64> %a1 to <13 x double> + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <13 x i64> br label %end cmp.false: - %a3 = bitcast <13 x i64> %a to <13 x double> + %a3 = bitcast <13 x double> %a to <13 x i64> br label %end end: - %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <13 x double> %phi + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi } -define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 ; -; VI-LABEL: bitcast_v13f64_to_v13i64: +; VI-LABEL: bitcast_v13f64_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -10687,19 +23023,46 @@ define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: .LBB19_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB39_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 ; -; GFX9-LABEL: bitcast_v13f64_to_v13i64: +; GFX9-LABEL: bitcast_v13f64_to_v13i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -10713,20 +23076,40 @@ define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB39_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 ; -; GFX11-LABEL: bitcast_v13f64_to_v13i64: +; GFX11-LABEL: bitcast_v13f64_to_v13i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -10740,8 +23123,6 @@ define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10761,286 +23142,301 @@ end: } define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v54, v47, v54 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v52, v57, v52 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v9, v9, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v10, v10, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v14, v14, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v18, v18, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v37 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13i64_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v52i16: ; VI: ; %bb.0: @@ -11078,7 +23474,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -11106,9 +23502,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc @@ -11162,7 +23558,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -11259,7 +23655,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -11287,9 +23683,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -11343,7 +23739,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -11387,7 +23783,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11422,7 +23818,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11459,7 +23855,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -11487,9 +23883,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11550,7 +23946,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -11597,470 +23993,1534 @@ end: ret <52 x i16> %phi } +define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v13i64_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v13i64: ; VI: ; %bb.0: @@ -12105,7 +25565,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -12186,9 +25646,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -12269,7 +25729,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12376,7 +25836,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -12500,9 +25960,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -12594,7 +26054,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12623,7 +26083,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12651,7 +26111,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12715,7 +26175,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12743,7 +26203,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12763,591 +26223,1696 @@ end: ret <13 x i64> %phi } +define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v39, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v34, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v51, v1 +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13i64_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v52f16: ; VI: ; %bb.0: @@ -13385,7 +27950,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -13413,9 +27978,9 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc @@ -13469,7 +28034,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -13566,7 +28131,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -13594,9 +28159,9 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -13650,7 +28215,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -13694,7 +28259,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13729,7 +28294,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13766,7 +28331,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -13794,9 +28359,9 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13857,7 +28422,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -13904,633 +28469,1862 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s40, v1 +; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s56, s24, 16 +; SI-NEXT: s_lshr_b32 s57, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s58, s26, 16 +; SI-NEXT: s_lshr_b32 s59, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s60, s28, 16 +; SI-NEXT: s_lshr_b32 s61, s29, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s62, s40, 16 +; SI-NEXT: s_lshr_b32 s63, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s72, s14, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s76, s10, 16 +; SI-NEXT: s_lshr_b32 s77, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s78, s7, 16 +; SI-NEXT: s_lshr_b32 s79, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s88, s6, 16 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v13i64_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v13i64: ; VI: ; %bb.0: @@ -14575,7 +30369,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -14656,9 +30450,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -14739,7 +30533,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -14846,7 +30640,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14970,9 +30764,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -15065,7 +30859,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15094,7 +30888,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15122,7 +30916,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15186,7 +30980,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15214,9 +31008,1229 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + +define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v52f16_to_v13i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15235,273 +32249,288 @@ end: } define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v55, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v55, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v55, v47, v55 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v54, v57, v54 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v42, v58, v42 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v7, v7, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v9, v9, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v10, v10, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v12, v12, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v18, v18, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v39 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v38 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v37 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13f64_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13f64_to_v52i16: ; VI: ; %bb.0: @@ -15539,7 +32568,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -15567,9 +32596,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15610,7 +32639,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -15707,7 +32736,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -15735,9 +32764,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15778,7 +32807,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -15822,7 +32851,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15837,7 +32866,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15874,7 +32903,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -15902,9 +32931,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15945,7 +32974,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -15992,470 +33021,1495 @@ end: ret <52 x i16> %phi } +define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s28 +; SI-NEXT: v_mov_b32_e32 v14, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v26 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v26 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v13f64_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: v_mov_b32_e32 v31, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v13f64: ; VI: ; %bb.0: @@ -16500,7 +34554,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -16581,9 +34635,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -16664,7 +34718,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -16771,7 +34825,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -16895,9 +34949,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -16989,7 +35043,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -17018,7 +35072,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -17046,7 +35100,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17110,7 +35164,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -17138,9 +35192,1125 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v39, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v34, v8 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v51, v1 +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v14, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v18, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v20, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v21, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v22, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17159,564 +36329,552 @@ end: } define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13f64_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13f64_to_v52f16: ; VI: ; %bb.0: @@ -17754,7 +36912,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -17782,9 +36940,9 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -17825,7 +36983,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -17922,7 +37080,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -17950,9 +37108,9 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -17993,7 +37151,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -18037,7 +37195,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -18052,7 +37210,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18089,7 +37247,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -18117,9 +37275,9 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -18160,7 +37318,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -18207,633 +37365,1850 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_and_b64 s[40:41], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: s_lshr_b32 s40, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: s_lshr_b32 s40, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s40 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 +; SI-NEXT: s_lshr_b32 s40, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 +; SI-NEXT: s_lshr_b32 s40, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s40 +; SI-NEXT: s_lshr_b32 s40, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s40 +; SI-NEXT: s_lshr_b32 s40, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: s_lshr_b32 s40, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s40 +; SI-NEXT: s_lshr_b32 s40, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 +; SI-NEXT: s_lshr_b32 s40, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 +; SI-NEXT: s_lshr_b32 s40, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s40 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s40 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s40 +; SI-NEXT: s_lshr_b32 s40, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s40 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s40 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s40 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 +; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v10 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: buffer_store_dword v13, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v10, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 +; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v53 +; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v48 +; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: v_mov_b32_e32 v31, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v13f64: ; VI: ; %bb.0: @@ -18878,7 +39253,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -18959,9 +39334,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -19042,7 +39417,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -19149,7 +39524,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -19273,9 +39648,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -19368,7 +39743,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -19397,7 +39772,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -19425,7 +39800,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19489,7 +39864,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -19517,9 +39892,1229 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v52f16_to_v13f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19538,790 +41133,807 @@ end: } define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v51 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v52 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v53 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v55 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v40 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v41 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v52f16: ; VI: ; %bb.0: @@ -20361,7 +41973,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v44, 3, v44 @@ -20415,7 +42027,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_add_u16_e32 v27, 3, v27 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 ; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20515,7 +42127,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v25, v44, v25, s6 @@ -20596,7 +42208,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v25 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v40, v21, s4 @@ -20641,7 +42253,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] @@ -20669,7 +42281,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20707,7 +42319,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 @@ -20787,7 +42399,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 @@ -20833,503 +42445,2160 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: s_branch .LBB57_3 +; SI-NEXT: .LBB57_2: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: .LBB57_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: v_mov_b32_e32 v62, v32 +; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v53 +; SI-NEXT: v_mov_b32_e32 v53, v55 +; SI-NEXT: v_mov_b32_e32 v55, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: s_cbranch_vccnz .LBB57_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v52i16_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v35, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v34, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v26, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v37, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v36, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v35, s27 +; GFX9-NEXT: v_mov_b32_e32 v34, s26 +; GFX9-NEXT: v_mov_b32_e32 v33, s25 +; GFX9-NEXT: v_mov_b32_e32 v32, s24 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s16 +; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s42 +; GFX9-NEXT: v_mov_b32_e32 v48, s41 +; GFX9-NEXT: v_mov_b32_e32 v49, s40 +; GFX9-NEXT: v_mov_b32_e32 v50, s15 +; GFX9-NEXT: v_mov_b32_e32 v51, s14 +; GFX9-NEXT: v_mov_b32_e32 v52, s13 +; GFX9-NEXT: v_mov_b32_e32 v53, s12 +; GFX9-NEXT: v_mov_b32_e32 v54, s11 +; GFX9-NEXT: v_mov_b32_e32 v55, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v41, s8 +; GFX9-NEXT: v_mov_b32_e32 v42, s7 +; GFX9-NEXT: v_mov_b32_e32 v43, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v43, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v42, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v27 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v51, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v50, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v49, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v48, 16, v35 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v10, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s15, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s12, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v67.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s10 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v34, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v23, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v50, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v49, 16, v52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s11 :: v_dual_mov_b32 v53, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v55, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v20 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v24 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v63, v9 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v63 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v56 -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; GCN-NEXT: v_or_b32_e32 v13, v13, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v2 -; GCN-NEXT: v_or_b32_e32 v16, v16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v41, v22 -; GCN-NEXT: v_or_b32_e32 v40, v40, v36 -; GCN-NEXT: v_or_b32_e32 v54, v54, v42 -; GCN-NEXT: v_or_b32_e32 v52, v52, v55 -; GCN-NEXT: v_or_b32_e32 v8, v50, v53 -; GCN-NEXT: v_or_b32_e32 v20, v20, v51 -; GCN-NEXT: v_or_b32_e32 v21, v21, v49 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_or_b32_e32 v39, v23, v26 -; GCN-NEXT: v_or_b32_e32 v35, v32, v24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v56, v31, v33 -; GCN-NEXT: v_or_b32_e32 v18, v18, v46 -; GCN-NEXT: v_or_b32_e32 v17, v17, v47 -; GCN-NEXT: v_or_b32_e32 v19, v19, v58 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_or_b32_e32 v5, v5, v62 -; GCN-NEXT: v_or_b32_e32 v4, v4, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v44 -; GCN-NEXT: v_alignbit_b32 v44, v3, v22, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v36, 16 -; GCN-NEXT: v_alignbit_b32 v42, v5, v42, 16 -; GCN-NEXT: v_alignbit_b32 v55, v6, v55, 16 -; GCN-NEXT: v_alignbit_b32 v53, v7, v53, 16 -; GCN-NEXT: v_alignbit_b32 v51, v19, v51, 16 -; GCN-NEXT: v_alignbit_b32 v22, v17, v49, 16 -; GCN-NEXT: v_alignbit_b32 v23, v18, v34, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v30, 16 -; GCN-NEXT: v_alignbit_b32 v28, v15, v28, 16 -; GCN-NEXT: v_alignbit_b32 v26, v14, v26, 16 -; GCN-NEXT: v_alignbit_b32 v24, v13, v24, 16 -; GCN-NEXT: v_alignbit_b32 v36, v12, v33, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v43 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v49, v49, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v40, v40, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v2, v4, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v54, v54, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v63 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v52, v52, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v6, v6, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v8, v8, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v20, v20, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v19, v19, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v61 -; GCN-NEXT: v_or_b32_e32 v17, v17, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v18, v18, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v16, v16, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v37 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v59 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v28, v56, v36 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v36 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v53 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v34 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v37, v22 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v49, v5 +; SI-NEXT: v_mov_b32_e32 v60, v7 +; SI-NEXT: v_mov_b32_e32 v62, v8 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_or_b32_e32 v2, v34, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v33, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_or_b32_e32 v62, v35, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v60, v34, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v49, v33, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v55 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: v_or_b32_e32 v22, v22, v29 +; SI-NEXT: v_or_b32_e32 v37, v33, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 +; SI-NEXT: v_or_b32_e32 v59, v28, v26 +; SI-NEXT: v_or_b32_e32 v39, v35, v55 +; SI-NEXT: v_or_b32_e32 v30, v30, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: v_or_b32_e32 v58, v28, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v28 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v31, v25, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_or_b32_e32 v54, v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 +; SI-NEXT: v_or_b32_e32 v53, v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 +; SI-NEXT: v_or_b32_e32 v36, v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_or_b32_e32 v52, v25, v33 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v50 +; SI-NEXT: v_or_b32_e32 v51, v28, v25 +; SI-NEXT: v_alignbit_b32 v45, v51, v38, 16 +; SI-NEXT: v_alignbit_b32 v44, v52, v44, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v4, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v15, v46, 16 +; SI-NEXT: v_alignbit_b32 v40, v21, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v23, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, v18, v47, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v56, 16 +; SI-NEXT: v_alignbit_b32 v25, v53, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v54, v57, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v52i16: ; VI: ; %bb.0: @@ -21369,7 +44638,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 @@ -21423,7 +44692,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 ; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -21523,7 +44792,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v25, v44, v25, s6 @@ -21605,7 +44874,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v25 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v40, v21, s4 @@ -21650,7 +44919,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] @@ -21678,7 +44947,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21716,7 +44985,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 @@ -21796,7 +45065,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 @@ -21841,3 +45110,1387 @@ end: %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <52 x i16> %phi } + +define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s29 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v43 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_or_b32_e32 v14, v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_or_b32_e32 v17, v17, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_or_b32_e32 v36, v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v34, v34, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v48, v29, v48 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v50, v50, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v43 +; SI-NEXT: v_or_b32_e32 v26, v26, v45 +; SI-NEXT: v_or_b32_e32 v21, v21, v30 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v49, v49, v46 +; SI-NEXT: v_or_b32_e32 v37, v37, v55 +; SI-NEXT: v_or_b32_e32 v35, v35, v54 +; SI-NEXT: v_or_b32_e32 v33, v33, v47 +; SI-NEXT: v_or_b32_e32 v15, v15, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v51 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v57 +; SI-NEXT: v_alignbit_b32 v44, v24, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v25, v45, 16 +; SI-NEXT: v_alignbit_b32 v42, v19, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v50, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v48, v46, 16 +; SI-NEXT: v_alignbit_b32 v40, v34, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v36, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v17, v47, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v9, v56, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v57, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v27, v27, v44 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v42 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v40 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v52f16_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v38, 0x200 +; VI-NEXT: v_add_f16_e32 v36, s16, v38 +; VI-NEXT: v_add_f16_e32 v43, s43, v38 +; VI-NEXT: v_add_f16_e32 v37, s17, v38 +; VI-NEXT: v_add_f16_e32 v42, s42, v38 +; VI-NEXT: v_add_f16_e32 v26, s18, v38 +; VI-NEXT: v_add_f16_e32 v41, s41, v38 +; VI-NEXT: v_add_f16_e32 v27, s19, v38 +; VI-NEXT: v_add_f16_e32 v40, s40, v38 +; VI-NEXT: v_add_f16_e32 v28, s20, v38 +; VI-NEXT: v_add_f16_e32 v55, s15, v38 +; VI-NEXT: v_add_f16_e32 v29, s21, v38 +; VI-NEXT: v_add_f16_e32 v54, s14, v38 +; VI-NEXT: v_add_f16_e32 v30, s22, v38 +; VI-NEXT: v_add_f16_e32 v53, s13, v38 +; VI-NEXT: v_add_f16_e32 v31, s23, v38 +; VI-NEXT: v_add_f16_e32 v52, s12, v38 +; VI-NEXT: v_add_f16_e32 v32, s24, v38 +; VI-NEXT: v_add_f16_e32 v51, s11, v38 +; VI-NEXT: v_add_f16_e32 v33, s25, v38 +; VI-NEXT: v_add_f16_e32 v50, s10, v38 +; VI-NEXT: v_add_f16_e32 v34, s26, v38 +; VI-NEXT: v_add_f16_e32 v49, s9, v38 +; VI-NEXT: v_add_f16_e32 v35, s27, v38 +; VI-NEXT: v_add_f16_e32 v48, s8, v38 +; VI-NEXT: v_add_f16_e32 v12, s28, v38 +; VI-NEXT: v_add_f16_e32 v39, s7, v38 +; VI-NEXT: v_add_f16_e32 v13, s29, v38 +; VI-NEXT: v_add_f16_e32 v38, s6, v38 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v38, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v39, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v48, s8 +; VI-NEXT: v_mov_b32_e32 v35, s27 +; VI-NEXT: v_mov_b32_e32 v49, s9 +; VI-NEXT: v_mov_b32_e32 v34, s26 +; VI-NEXT: v_mov_b32_e32 v50, s10 +; VI-NEXT: v_mov_b32_e32 v33, s25 +; VI-NEXT: v_mov_b32_e32 v51, s11 +; VI-NEXT: v_mov_b32_e32 v32, s24 +; VI-NEXT: v_mov_b32_e32 v52, s12 +; VI-NEXT: v_mov_b32_e32 v31, s23 +; VI-NEXT: v_mov_b32_e32 v53, s13 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: v_mov_b32_e32 v54, s14 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v55, s15 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v40, s40 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v41, s41 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v42, s42 +; VI-NEXT: v_mov_b32_e32 v37, s17 +; VI-NEXT: v_mov_b32_e32 v43, s43 +; VI-NEXT: v_mov_b32_e32 v36, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v36, v36, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v28, v28, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v52f16_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v35, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v27, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v37, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v36, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v35, s27 +; GFX9-NEXT: v_mov_b32_e32 v34, s26 +; GFX9-NEXT: v_mov_b32_e32 v33, s25 +; GFX9-NEXT: v_mov_b32_e32 v32, s24 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s16 +; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s42 +; GFX9-NEXT: v_mov_b32_e32 v48, s41 +; GFX9-NEXT: v_mov_b32_e32 v49, s40 +; GFX9-NEXT: v_mov_b32_e32 v50, s15 +; GFX9-NEXT: v_mov_b32_e32 v51, s14 +; GFX9-NEXT: v_mov_b32_e32 v52, s13 +; GFX9-NEXT: v_mov_b32_e32 v53, s12 +; GFX9-NEXT: v_mov_b32_e32 v54, s11 +; GFX9-NEXT: v_mov_b32_e32 v55, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v41, s8 +; GFX9-NEXT: v_mov_b32_e32 v42, s7 +; GFX9-NEXT: v_mov_b32_e32 v43, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v43, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v42, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v27 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v51, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v50, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v49, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v48, 16, v35 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v10, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s15, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s12, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v67.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s10 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v34, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v23, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v50, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v49, 16, v52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s11 :: v_dual_mov_b32 v53, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v55, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index cdbe26b309831..5ed1db9e65839 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -1,52 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v28f32: ; VI: ; %bb.0: @@ -188,47 +188,337 @@ end: ret <28 x float> %phi } +define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v28i32_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v28i32_to_v28f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v28i32: ; VI: ; %bb.0: @@ -237,7 +527,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -267,7 +557,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -278,7 +568,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -308,7 +598,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -320,7 +610,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -336,7 +626,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -356,47 +646,323 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v28f32_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v28f32_to_v28i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v14i64: ; VI: ; %bb.0: @@ -405,7 +971,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -435,7 +1001,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -446,7 +1012,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -476,7 +1042,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -488,7 +1054,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -518,7 +1084,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -538,71 +1104,361 @@ end: ret <14 x i64> %phi } -define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v14i64_to_v28i32: +; VI-LABEL: bitcast_v28i32_to_v14i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v28i32_to_v14i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v28i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -617,7 +1473,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -628,7 +1484,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc @@ -658,7 +1514,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -670,7 +1526,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -707,7 +1563,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -727,47 +1583,344 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v14i64_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v14i64_to_v28i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v14f64: ; VI: ; %bb.0: @@ -776,7 +1929,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -806,7 +1959,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -817,7 +1970,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -847,7 +2000,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -859,7 +2012,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -889,7 +2042,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -909,42 +2062,332 @@ end: ret <14 x double> %phi } -define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v14f64_to_v28i32: +; VI-LABEL: bitcast_v28i32_to_v14f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v28i32_to_v14f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v28i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -960,7 +2403,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -971,7 +2414,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -987,7 +2430,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -999,7 +2442,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -1015,7 +2458,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1035,312 +2478,569 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v15, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v14f64_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v29, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v14f64_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: v_mov_b32_e32 v15, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v14f64_to_v28i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v41 -; GCN-NEXT: v_or_b32_e32 v5, v57, v45 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v41, v58, v56 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v59, v43 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v43, v60, v47 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v47, v61, v62 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v45, v45, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v56, v56, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v44, v58, v44 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v9, v9, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v13, v13, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v20, v20, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v56i16: ; VI: ; %bb.0: @@ -1384,7 +3084,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -1414,9 +3114,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -1474,7 +3174,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 @@ -1585,7 +3285,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -1615,9 +3315,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -1675,7 +3375,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 @@ -1725,7 +3425,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -1755,7 +3455,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1794,7 +3494,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -1824,9 +3524,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -1884,7 +3584,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 @@ -1933,522 +3633,1672 @@ end: ret <56 x i16> %phi } +define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v28i32_to_v56i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v56i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <56 x i16> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <56 x i16> + br label %end + +end: + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi +} + define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56i16_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v28i32: ; VI: ; %bb.0: @@ -2497,7 +5347,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2584,9 +5434,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 @@ -2673,7 +5523,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v28, 3, v32 ; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2788,7 +5638,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -2928,9 +5778,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -3032,7 +5882,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3061,7 +5911,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -3091,7 +5941,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3159,7 +6009,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -3189,7 +6039,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3209,640 +6059,1860 @@ end: ret <28 x i32> %phi } -define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v28i32_to_v56f16: +; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v39, v13 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_mov_b32_e32 v35, v11 +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v9 +; VI-NEXT: v_mov_b32_e32 v32, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + +define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v28i32_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v28i32_to_v56f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill @@ -3879,7 +7949,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -3909,9 +7979,9 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -3969,7 +8039,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 @@ -4080,7 +8150,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -4110,9 +8180,9 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -4170,7 +8240,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 @@ -4220,7 +8290,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -4250,7 +8320,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4289,7 +8359,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -4319,9 +8389,9 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -4379,7 +8449,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 @@ -4428,704 +8498,2030 @@ end: ret <56 x half> %phi } +define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s21, 16 +; SI-NEXT: s_lshr_b32 s56, s22, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s24, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s27, 16 +; SI-NEXT: s_lshr_b32 s62, s28, 16 +; SI-NEXT: s_lshr_b32 s63, s29, 16 +; SI-NEXT: s_lshr_b32 s72, s43, 16 +; SI-NEXT: s_lshr_b32 s73, s42, 16 +; SI-NEXT: s_lshr_b32 s74, s41, 16 +; SI-NEXT: s_lshr_b32 s75, s40, 16 +; SI-NEXT: s_lshr_b32 s76, s15, 16 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s13, 16 +; SI-NEXT: s_lshr_b32 s79, s12, 16 +; SI-NEXT: s_lshr_b32 s88, s11, 16 +; SI-NEXT: s_lshr_b32 s89, s10, 16 +; SI-NEXT: s_lshr_b32 s90, s8, 16 +; SI-NEXT: s_lshr_b32 s91, s7, 16 +; SI-NEXT: s_lshr_b32 s92, s6, 16 +; SI-NEXT: s_lshr_b32 s93, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v28i32_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56f16_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v28i32: ; VI: ; %bb.0: @@ -5174,7 +10570,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5261,9 +10657,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5350,7 +10746,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5465,7 +10861,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -5605,9 +11001,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -5710,7 +11106,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5739,7 +11135,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5769,7 +11165,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5837,7 +11233,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5867,7 +11263,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5887,47 +11283,1381 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v56f16_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v14i64: ; VI: ; %bb.0: @@ -5936,7 +12666,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -5966,7 +12696,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5977,7 +12707,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6007,7 +12737,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6019,7 +12749,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -6035,7 +12765,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6055,47 +12785,323 @@ end: ret <14 x i64> %phi } +define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v28f32_to_v14i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v28f32_to_v14i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i64_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i64_to_v28f32: ; VI: ; %bb.0: @@ -6104,7 +13110,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 ; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc @@ -6134,7 +13140,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6145,7 +13151,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc @@ -6175,7 +13181,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6187,7 +13193,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6224,7 +13230,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6244,47 +13250,344 @@ end: ret <28 x float> %phi } +define inreg <28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v14i64_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v14i64_to_v28f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v14f64: ; VI: ; %bb.0: @@ -6293,7 +13596,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6323,7 +13626,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6334,7 +13637,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6364,7 +13667,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6376,7 +13679,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -6392,7 +13695,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6412,69 +13715,345 @@ end: ret <14 x double> %phi } -define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v14f64_to_v28f32: +; VI-LABEL: bitcast_v28f32_to_v14f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v14f64_to_v28f32: +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v28f32_to_v14f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v28f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v28f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -6490,7 +14069,7 @@ define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6502,7 +14081,7 @@ define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -6518,7 +14097,7 @@ define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6538,312 +14117,569 @@ end: ret <28 x float> %phi } +define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v15, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v14f64_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v29, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v14f64_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: v_mov_b32_e32 v15, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v14f64_to_v28f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v41 -; GCN-NEXT: v_or_b32_e32 v5, v57, v45 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v41, v58, v56 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v59, v43 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v43, v60, v47 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v47, v61, v62 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v45, v45, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v56, v56, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v44, v58, v44 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v9, v9, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v13, v13, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v20, v20, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v56i16: ; VI: ; %bb.0: @@ -6887,7 +14723,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -6917,9 +14753,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6977,7 +14813,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 @@ -7088,7 +14924,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -7118,9 +14954,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -7178,7 +15014,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 @@ -7228,7 +15064,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -7244,7 +15080,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7283,7 +15119,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -7313,9 +15149,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -7359,7 +15195,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 @@ -7408,633 +15244,556 @@ end: ret <56 x i16> %phi } -define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v29, s17 +; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v23, s19 +; SI-NEXT: v_mov_b32_e32 v28, s20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v30, v30, v44 +; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v25, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_branch .LBB29_2 ; -; VI-LABEL: bitcast_v56i16_to_v28f32: +; VI-LABEL: bitcast_v28f32_to_v56i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v26, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB29_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v20, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -8049,266 +15808,219 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v59 -; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v57 -; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v56 -; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v47 -; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v46 -; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v45 -; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v44 -; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v43 -; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v42 -; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v41 -; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v40 -; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v55 -; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v54 -; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v53 -; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v52 -; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v51 -; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v50 -; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v49 -; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v48 -; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v39 -; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v38 -; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v37 -; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v36 -; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v35 -; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v34 -; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v33 -; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v32 -; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB15_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: s_branch .LBB29_2 ; -; GFX9-LABEL: bitcast_v56i16_to_v28f32: +; GFX9-LABEL: bitcast_v28f32_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v26, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v27, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -8323,1007 +16035,994 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB15_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_branch .LBB29_2 ; -; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v17, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v13, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 ; -; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <56 x i16> %a, splat (i16 3) - %a2 = bitcast <56 x i16> %a1 to <28 x float> + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <56 x i16> br label %end cmp.false: - %a3 = bitcast <56 x i16> %a to <28 x float> + %a3 = bitcast <28 x float> %a to <56 x i16> br label %end end: - %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <28 x float> %phi + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi } -define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v56i16_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v28f32_to_v56f16: +; VI-LABEL: bitcast_v56i16_to_v28f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -9348,1342 +17047,8366 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: v_mov_b32_e32 v27, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v59 +; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v58 +; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v57 +; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v56 +; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v47 +; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v46 +; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v45 +; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v44 +; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 +; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v42 +; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v41 +; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v40 +; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v55 +; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v54 +; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v53 +; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v52 +; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v51 +; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v50 +; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v49 +; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v48 +; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v39 +; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v38 +; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v37 +; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v36 +; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v35 +; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v34 +; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v33 +; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v32 +; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v28f32_to_v56f16: +; GFX9-LABEL: bitcast_v56i16_to_v28f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB30_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB30_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB30_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v39, v13 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_mov_b32_e32 v35, v11 +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v9 +; VI-NEXT: v_mov_b32_e32 v32, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v28f32_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v28f32_to_v56f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB32_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f32_to_v56f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB32_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB32_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + +define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s6 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s7 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_add_f32_e64 v14, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v36, s6, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 +; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; SI-NEXT: v_add_f32_e64 v48, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 +; SI-NEXT: v_add_i32_e32 v31, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_add_i32_e32 v12, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v28f32_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v26, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v20, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v26, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v27, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v17, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v13, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + +define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v56f16_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v56f16_to_v28f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 +; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v56f16_to_v28f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v56f16_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v14f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14i64_to_v14f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i64_to_v14f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v14i64_to_v14f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v14i64_to_v14f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v14i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v14i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f64_to_v14i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <28 x float> %a1 to <56 x half> + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <14 x i64> br label %end cmp.false: - %a3 = bitcast <28 x float> %a to <56 x half> + %a3 = bitcast <14 x double> %a to <14 x i64> br label %end end: - %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <56 x half> %phi + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi } -define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v15, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 ; -; VI-LABEL: bitcast_v56f16_to_v28f32: +; VI-LABEL: bitcast_v14f64_to_v14i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v29, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v14f64_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: v_mov_b32_e32 v15, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v14f64_to_v14i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v56i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -10708,477 +25431,346 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 -; VI-NEXT: v_or_b32_e32 v22, v23, v22 -; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 -; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 -; VI-NEXT: v_or_b32_e32 v24, v25, v24 -; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v26, v28, v26 -; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v56f16_to_v28f32: +; GFX9-LABEL: bitcast_v14i64_to_v56i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -11186,989 +25778,2279 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = fadd <56 x half> %a, splat (half 0xH0200) - %a2 = bitcast <56 x half> %a1 to <28 x float> - br label %end - -cmp.false: - %a3 = bitcast <56 x half> %a to <28 x float> - br label %end - -end: - %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <28 x float> %phi -} - -define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v14i64_to_v14f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v14i64_to_v14f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v14i64_to_v14f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - cmp.true: %a1 = add <14 x i64> %a, splat (i64 3) - %a2 = bitcast <14 x i64> %a1 to <14 x double> + %a2 = bitcast <14 x i64> %a1 to <56 x i16> br label %end cmp.false: - %a3 = bitcast <14 x i64> %a to <14 x double> + %a3 = bitcast <14 x i64> %a to <56 x i16> br label %end end: - %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x double> %phi + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi } -define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: s_branch .LBB41_2 ; -; VI-LABEL: bitcast_v14f64_to_v14i64: +; VI-LABEL: bitcast_v14i64_to_v56i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: .LBB19_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB41_2 ; -; GFX9-LABEL: bitcast_v14f64_to_v14i64: +; GFX9-LABEL: bitcast_v14i64_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB41_2 ; -; GFX11-LABEL: bitcast_v14f64_to_v14i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <14 x double> %a1 to <14 x i64> + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <56 x i16> br label %end cmp.false: - %a3 = bitcast <14 x double> %a to <14 x i64> + %a3 = bitcast <14 x i64> %a to <56 x i16> br label %end end: - %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x i64> %phi + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi } -define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v40 -; GCN-NEXT: v_or_b32_e32 v5, v57, v45 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v40, v58, v56 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v59, v43 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v43, v60, v47 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v47, v61, v62 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v45, v45, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v56, v56, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v44, v58, v44 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v9, v9, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v13, v13, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v20, v20, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v56i16_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v14i64_to_v56i16: +; VI-LABEL: bitcast_v56i16_to_v14i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v27, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v59 +; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v58 +; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v57 +; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v56 +; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v47 +; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v46 +; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v45 +; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v44 +; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 +; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v42 +; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v41 +; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v40 +; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v55 +; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v54 +; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v53 +; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v52 +; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v51 +; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v50 +; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v49 +; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v48 +; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v39 +; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v38 +; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v37 +; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v36 +; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v35 +; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v34 +; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v33 +; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v32 +; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v14i64_to_v56i16: +; GFX9-LABEL: bitcast_v56i16_to_v14i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -12188,150 +28070,207 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16: +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -12339,855 +28278,2014 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16: +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <14 x i64> %a, splat (i64 3) - %a2 = bitcast <14 x i64> %a1 to <56 x i16> + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v39, v13 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_mov_b32_e32 v35, v11 +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v9 +; VI-NEXT: v_mov_b32_e32 v32, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x i64> br label %end cmp.false: - %a3 = bitcast <14 x i64> %a to <56 x i16> + %a3 = bitcast <56 x i16> %a to <14 x i64> br label %end end: - %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <56 x i16> %phi + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi } -define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v56i16_to_v14i64: +; VI-LABEL: bitcast_v14i64_to_v56f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -13212,476 +30310,346 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v59 -; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v57 -; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v56 -; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v47 -; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v46 -; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v45 -; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v44 -; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v43 -; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v42 -; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v41 -; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v40 -; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v55 -; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v54 -; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v53 -; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v52 -; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v51 -; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v50 -; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v49 -; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v48 -; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v39 -; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v38 -; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v37 -; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v36 -; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v35 -; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v34 -; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v33 -; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v32 -; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB44_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v56i16_to_v14i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14i64_to_v56f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB21_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB44_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64: +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -13689,995 +30657,2637 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + +define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s42, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_readfirstlane_b32 s40, v3 +; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s56, s22, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s58, s24, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s60, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s62, s28, 16 +; SI-NEXT: s_lshr_b32 s63, s29, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s72, s42, 16 +; SI-NEXT: s_lshr_b32 s73, s43, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s74, s40, 16 +; SI-NEXT: s_lshr_b32 s75, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s76, s14, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s78, s12, 16 +; SI-NEXT: s_lshr_b32 s79, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s88, s10, 16 +; SI-NEXT: s_lshr_b32 s89, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s90, s7, 16 +; SI-NEXT: s_lshr_b32 s91, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s92, s6, 16 +; SI-NEXT: s_lshr_b32 s93, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v14i64_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <56 x i16> %a, splat (i16 3) - %a2 = bitcast <56 x i16> %a1 to <14 x i64> + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <56 x half> br label %end cmp.false: - %a3 = bitcast <56 x i16> %a to <14 x i64> + %a3 = bitcast <14 x i64> %a to <56 x half> br label %end end: - %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x i64> %phi + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi } -define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v56f16_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v14i64_to_v56f16: +; VI-LABEL: bitcast_v56f16_to_v14i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB46_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 +; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v14i64_to_v56f16: +; GFX9-LABEL: bitcast_v56f16_to_v14i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -14697,150 +33307,208 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -14848,1108 +33516,891 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <14 x i64> %a, splat (i64 3) - %a2 = bitcast <14 x i64> %a1 to <56 x half> - br label %end - -cmp.false: - %a3 = bitcast <14 x i64> %a to <56 x half> - br label %end - -end: - %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <56 x half> %phi -} - -define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v56f16_to_v14i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v56f16_to_v14i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -15988,171 +34439,1077 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v25, v26, v25 ; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v26, v28, v26 -; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB23_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v56f16_to_v14i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v56i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v56i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -16172,208 +35529,136 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB23_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64: +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -16381,580 +35666,654 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <56 x half> %a, splat (half 0xH0200) - %a2 = bitcast <56 x half> %a1 to <14 x i64> - br label %end - -cmp.false: - %a3 = bitcast <56 x half> %a to <14 x i64> - br label %end - -end: - %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x i64> %phi -} - -define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_alignbit_b32 v46, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_alignbit_b32 v46, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v46, v57, v46 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v56, v58, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v5, v5, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v6, v6, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v7, v7, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v9, v9, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v11, v11, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v13, v13, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v14, v14, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v16, v16, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v18, v18, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v20, v20, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v22, v22, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v24, v24, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v39 -; GCN-NEXT: buffer_store_dword v46, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v14f64_to_v56i16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB24_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <56 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <56 x i16> + br label %end + +end: + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi +} + +define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v44 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v14f64_to_v56i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v32, s20 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 +; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16963,53 +36322,87 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_branch .LBB49_2 ; -; GFX9-LABEL: bitcast_v14f64_to_v56i16: +; GFX9-LABEL: bitcast_v14f64_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -17018,77 +36411,38 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -17096,45 +36450,84 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -17143,62 +36536,418 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_branch .LBB49_2 ; -; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16: +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v16, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 ; -; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 @@ -17216,127 +36965,7 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17355,521 +36984,493 @@ end: } define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56i16_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v14f64: ; VI: ; %bb.0: @@ -17918,7 +37519,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -18005,9 +37606,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 @@ -18094,7 +37695,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v28, 3, v32 ; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -18111,150 +37712,2547 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v56i16_to_v14f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-LABEL: bitcast_v56i16_to_v14f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB50_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB50_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v39, v13 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_mov_b32_e32 v35, v11 +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v9 +; VI-NEXT: v_mov_b32_e32 v32, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v14, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v20, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v22, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v23, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v24, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v56f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v56f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -18274,207 +40272,136 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB25_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -18482,890 +40409,843 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <56 x i16> %a, splat (i16 3) - %a2 = bitcast <56 x i16> %a1 to <14 x double> + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <56 x half> br label %end cmp.false: - %a3 = bitcast <56 x i16> %a to <14 x double> + %a3 = bitcast <14 x double> %a to <56 x half> br label %end end: - %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x double> %phi + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi } -define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v1 +; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_and_b64 s[42:43], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s42, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: s_lshr_b32 s42, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s42 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: s_lshr_b32 s42, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: s_lshr_b32 s42, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: s_lshr_b32 s42, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: s_lshr_b32 s42, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 +; SI-NEXT: s_lshr_b32 s42, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 +; SI-NEXT: s_lshr_b32 s42, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s42 +; SI-NEXT: s_lshr_b32 s42, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s42 +; SI-NEXT: s_lshr_b32 s42, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s42 +; SI-NEXT: s_lshr_b32 s42, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s42 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s42 +; SI-NEXT: s_lshr_b32 s42, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s42 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s42 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 +; SI-NEXT: s_lshr_b32 s42, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s42 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s42 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s42 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s42 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s42 +; SI-NEXT: s_lshr_b32 s42, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s6 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[42:43], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v42 +; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 +; SI-NEXT: v_add_f64 v[53:54], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_branch .LBB53_2 ; -; VI-LABEL: bitcast_v14f64_to_v56f16: +; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: ; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB26_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v32, s20 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 +; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19374,53 +41254,87 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_branch .LBB53_2 ; -; GFX9-LABEL: bitcast_v14f64_to_v56f16: +; GFX9-LABEL: bitcast_v14f64_to_v56f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -19429,77 +41343,38 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -19507,45 +41382,84 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19554,62 +41468,418 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_branch .LBB53_2 ; -; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v16, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 ; -; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 @@ -19627,127 +41897,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19766,703 +41916,685 @@ end: } define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56f16_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v14f64: ; VI: ; %bb.0: @@ -20511,7 +42643,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -20598,9 +42730,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -20687,7 +42819,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -20802,7 +42934,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -20942,9 +43074,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -21047,7 +43179,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -21076,7 +43208,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -21106,7 +43238,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21174,7 +43306,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -21204,9 +43336,1343 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v56f16_to_v14f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -21225,882 +44691,901 @@ end: } define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v44 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v41 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v40 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56i16_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v56f16: ; VI: ; %bb.0: @@ -22146,7 +45631,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v56, 3, v56 @@ -22204,7 +45689,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_add_u16_e32 v29, 3, v29 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 ; VI-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -22318,7 +45803,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v27, v56, v27, s6 @@ -22405,7 +45890,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v40, v19, s4 @@ -22456,7 +45941,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] @@ -22486,7 +45971,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22526,7 +46011,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 @@ -22612,7 +46097,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 @@ -22660,609 +46145,2378 @@ end: ret <56 x half> %phi } +define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v57 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB57_3 +; SI-NEXT: .LBB57_2: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: .LBB57_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: v_mov_b32_e32 v36, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v52, v54 +; SI-NEXT: v_mov_b32_e32 v54, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: s_cbranch_vccnz .LBB57_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v63 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v56i16_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v49, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v48, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v39, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v38, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v37, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v36, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v35, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v34, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v49, s29 +; GFX9-NEXT: v_mov_b32_e32 v48, s28 +; GFX9-NEXT: v_mov_b32_e32 v39, s27 +; GFX9-NEXT: v_mov_b32_e32 v38, s26 +; GFX9-NEXT: v_mov_b32_e32 v37, s25 +; GFX9-NEXT: v_mov_b32_e32 v36, s24 +; GFX9-NEXT: v_mov_b32_e32 v35, s23 +; GFX9-NEXT: v_mov_b32_e32 v34, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v31, s19 +; GFX9-NEXT: v_mov_b32_e32 v30, s18 +; GFX9-NEXT: v_mov_b32_e32 v29, s17 +; GFX9-NEXT: v_mov_b32_e32 v28, s16 +; GFX9-NEXT: v_mov_b32_e32 v50, s43 +; GFX9-NEXT: v_mov_b32_e32 v51, s42 +; GFX9-NEXT: v_mov_b32_e32 v52, s41 +; GFX9-NEXT: v_mov_b32_e32 v53, s40 +; GFX9-NEXT: v_mov_b32_e32 v54, s15 +; GFX9-NEXT: v_mov_b32_e32 v55, s14 +; GFX9-NEXT: v_mov_b32_e32 v40, s13 +; GFX9-NEXT: v_mov_b32_e32 v41, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v43, s10 +; GFX9-NEXT: v_mov_b32_e32 v44, s9 +; GFX9-NEXT: v_mov_b32_e32 v45, s8 +; GFX9-NEXT: v_mov_b32_e32 v46, s7 +; GFX9-NEXT: v_mov_b32_e32 v47, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v28, v47, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v46, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v45, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v44, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v43, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v42, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v41, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v40, 16, v35 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v36, v55, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v54, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v53, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v52, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v51, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v50, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v12, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v10, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v12, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s13, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v71.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s12 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v38, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v50, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v39, 16, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.h +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v51, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v34, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v35, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s41 :: v_dual_mov_b32 v51, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s13 :: v_dual_mov_b32 v55, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s11 :: v_dual_mov_b32 v65, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s9 :: v_dual_mov_b32 v67, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v28 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v30 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v48 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v34 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v61 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v62, v40 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v11 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v15 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v32 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v33 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v61, v13, v3 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; GCN-NEXT: v_or_b32_e32 v41, v2, v15 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v57, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v18 -; GCN-NEXT: v_or_b32_e32 v15, v4, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v62 -; GCN-NEXT: v_or_b32_e32 v16, v8, v17 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v9, v3 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v31 -; GCN-NEXT: v_or_b32_e32 v18, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GCN-NEXT: v_or_b32_e32 v19, v11, v4 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v20, v20, v8 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; GCN-NEXT: v_or_b32_e32 v21, v21, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GCN-NEXT: v_or_b32_e32 v14, v14, v2 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v38 -; GCN-NEXT: v_or_b32_e32 v46, v46, v24 -; GCN-NEXT: v_or_b32_e32 v44, v44, v47 -; GCN-NEXT: v_or_b32_e32 v3, v42, v45 -; GCN-NEXT: v_or_b32_e32 v11, v40, v43 -; GCN-NEXT: v_or_b32_e32 v2, v54, v37 -; GCN-NEXT: v_or_b32_e32 v10, v52, v55 -; GCN-NEXT: v_or_b32_e32 v8, v50, v53 -; GCN-NEXT: v_or_b32_e32 v25, v35, v51 -; GCN-NEXT: v_or_b32_e32 v22, v56, v49 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_or_b32_e32 v54, v59, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v58 -; GCN-NEXT: v_or_b32_e32 v31, v34, v60 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v1, v48, v26 -; GCN-NEXT: v_alignbit_b32 v56, v1, v24, 16 -; GCN-NEXT: v_alignbit_b32 v47, v6, v47, 16 -; GCN-NEXT: v_alignbit_b32 v45, v7, v45, 16 -; GCN-NEXT: v_alignbit_b32 v43, v5, v43, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v37, 16 -; GCN-NEXT: v_alignbit_b32 v4, v21, v55, 16 -; GCN-NEXT: v_alignbit_b32 v26, v20, v53, 16 -; GCN-NEXT: v_alignbit_b32 v9, v19, v51, 16 -; GCN-NEXT: v_alignbit_b32 v24, v18, v49, 16 -; GCN-NEXT: v_alignbit_b32 v23, v17, v39, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v30, 16 -; GCN-NEXT: v_alignbit_b32 v28, v15, v28, 16 -; GCN-NEXT: v_alignbit_b32 v63, v41, v58, 16 -; GCN-NEXT: v_alignbit_b32 v59, v61, v60, 16 -; GCN-NEXT: v_mov_b32_e32 v60, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v31 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v56 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v47 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v45 -; GCN-NEXT: v_or_b32_e32 v42, v35, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v35 -; GCN-NEXT: v_or_b32_e32 v37, v1, v37 -; GCN-NEXT: v_mov_b32_e32 v55, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v43 -; GCN-NEXT: v_or_b32_e32 v43, v39, v49 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; GCN-NEXT: v_or_b32_e32 v35, v6, v50 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_or_b32_e32 v3, v3, v51 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v55 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v53 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v5, v5, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v14, v14, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v4, v10, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v62 -; GCN-NEXT: v_or_b32_e32 v21, v21, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v8, v8, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v20, v20, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v9, v25, v9 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v19, v19, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v18, v18, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v17, v17, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v63 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v54 -; GCN-NEXT: v_or_b32_e32 v16, v16, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_or_b32_e32 v28, v48, v28 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v15, v15, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v38, v38, v47 -; GCN-NEXT: v_or_b32_e32 v41, v41, v56 -; GCN-NEXT: v_or_b32_e32 v54, v54, v58 -; GCN-NEXT: v_or_b32_e32 v47, v59, v57 -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56f16_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v36 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v37 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v51 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v54 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v41 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v51, v9 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v38 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v31 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v38, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v7, v31, v45 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v44 +; SI-NEXT: v_or_b32_e32 v41, v37, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v54, v39, v43 +; SI-NEXT: v_or_b32_e32 v52, v38, v42 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v51, v25, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v48, v37, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 +; SI-NEXT: v_or_b32_e32 v57, v21, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_or_b32_e32 v62, v25, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v61, v29, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_or_b32_e32 v49, v21, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 +; SI-NEXT: v_or_b32_e32 v35, v25, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_or_b32_e32 v33, v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v53 +; SI-NEXT: v_or_b32_e32 v36, v21, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v55, v7, v21 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v50 +; SI-NEXT: v_alignbit_b32 v26, v36, v26, 16 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v53, v25, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v59, v13, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_alignbit_b32 v29, v59, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v53, v27, 16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_alignbit_b32 v27, v55, v60, 16 +; SI-NEXT: v_or_b32_e32 v22, v21, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_or_b32_e32 v24, v24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v37, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_alignbit_b32 v56, v3, v47, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v1, v31, 16 +; SI-NEXT: v_alignbit_b32 v21, v24, v38, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v56i16: ; VI: ; %bb.0: @@ -23308,7 +48562,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v56, 0x200, v56 @@ -23366,7 +48620,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 ; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 ; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 ; VI-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -23480,7 +48734,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v27, v56, v27, s6 @@ -23568,7 +48822,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v40, v19, s4 @@ -23619,7 +48873,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] @@ -23649,7 +48903,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23689,7 +48943,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 @@ -23775,7 +49029,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 @@ -23822,3 +49076,1521 @@ end: %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <56 x i16> %phi } + +define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v45 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v57, v54, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v34, v34, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v37, v37, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v36, v36, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v51, v51, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v52, v52, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_or_b32_e32 v27, v27, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v47 +; SI-NEXT: v_or_b32_e32 v26, v26, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_or_b32_e32 v50, v50, v43 +; SI-NEXT: v_or_b32_e32 v48, v48, v42 +; SI-NEXT: v_or_b32_e32 v38, v38, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v18, v18, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v59 +; SI-NEXT: v_or_b32_e32 v12, v12, v53 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_alignbit_b32 v56, v25, v47, 16 +; SI-NEXT: v_alignbit_b32 v47, v27, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v20, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v52, v33, 16 +; SI-NEXT: v_alignbit_b32 v44, v51, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v36, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v37, v58, 16 +; SI-NEXT: v_alignbit_b32 v41, v34, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v15, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v16, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v13, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v6, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v60, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v29, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v46 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v56f16_to_v56i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v50, 0x200 +; VI-NEXT: v_add_f16_e32 v28, s16, v50 +; VI-NEXT: v_add_f16_e32 v47, s43, v50 +; VI-NEXT: v_add_f16_e32 v29, s17, v50 +; VI-NEXT: v_add_f16_e32 v46, s42, v50 +; VI-NEXT: v_add_f16_e32 v30, s18, v50 +; VI-NEXT: v_add_f16_e32 v45, s41, v50 +; VI-NEXT: v_add_f16_e32 v31, s19, v50 +; VI-NEXT: v_add_f16_e32 v44, s40, v50 +; VI-NEXT: v_add_f16_e32 v32, s20, v50 +; VI-NEXT: v_add_f16_e32 v43, s15, v50 +; VI-NEXT: v_add_f16_e32 v33, s21, v50 +; VI-NEXT: v_add_f16_e32 v42, s14, v50 +; VI-NEXT: v_add_f16_e32 v34, s22, v50 +; VI-NEXT: v_add_f16_e32 v41, s13, v50 +; VI-NEXT: v_add_f16_e32 v35, s23, v50 +; VI-NEXT: v_add_f16_e32 v40, s12, v50 +; VI-NEXT: v_add_f16_e32 v36, s24, v50 +; VI-NEXT: v_add_f16_e32 v55, s11, v50 +; VI-NEXT: v_add_f16_e32 v37, s25, v50 +; VI-NEXT: v_add_f16_e32 v54, s10, v50 +; VI-NEXT: v_add_f16_e32 v38, s26, v50 +; VI-NEXT: v_add_f16_e32 v53, s9, v50 +; VI-NEXT: v_add_f16_e32 v39, s27, v50 +; VI-NEXT: v_add_f16_e32 v52, s8, v50 +; VI-NEXT: v_add_f16_e32 v48, s28, v50 +; VI-NEXT: v_add_f16_e32 v51, s7, v50 +; VI-NEXT: v_add_f16_e32 v49, s29, v50 +; VI-NEXT: v_add_f16_e32 v50, s6, v50 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v50, s6 +; VI-NEXT: v_mov_b32_e32 v49, s29 +; VI-NEXT: v_mov_b32_e32 v51, s7 +; VI-NEXT: v_mov_b32_e32 v48, s28 +; VI-NEXT: v_mov_b32_e32 v52, s8 +; VI-NEXT: v_mov_b32_e32 v39, s27 +; VI-NEXT: v_mov_b32_e32 v53, s9 +; VI-NEXT: v_mov_b32_e32 v38, s26 +; VI-NEXT: v_mov_b32_e32 v54, s10 +; VI-NEXT: v_mov_b32_e32 v37, s25 +; VI-NEXT: v_mov_b32_e32 v55, s11 +; VI-NEXT: v_mov_b32_e32 v36, s24 +; VI-NEXT: v_mov_b32_e32 v40, s12 +; VI-NEXT: v_mov_b32_e32 v35, s23 +; VI-NEXT: v_mov_b32_e32 v41, s13 +; VI-NEXT: v_mov_b32_e32 v34, s22 +; VI-NEXT: v_mov_b32_e32 v42, s14 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v43, s15 +; VI-NEXT: v_mov_b32_e32 v32, s20 +; VI-NEXT: v_mov_b32_e32 v44, s40 +; VI-NEXT: v_mov_b32_e32 v31, s19 +; VI-NEXT: v_mov_b32_e32 v45, s41 +; VI-NEXT: v_mov_b32_e32 v30, s18 +; VI-NEXT: v_mov_b32_e32 v46, s42 +; VI-NEXT: v_mov_b32_e32 v29, s17 +; VI-NEXT: v_mov_b32_e32 v47, s43 +; VI-NEXT: v_mov_b32_e32 v28, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v28, v28, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v36, v36, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v38, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v49, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v56f16_to_v56i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v49, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v48, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v39, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v38, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v37, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v36, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v35, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v49, s29 +; GFX9-NEXT: v_mov_b32_e32 v48, s28 +; GFX9-NEXT: v_mov_b32_e32 v39, s27 +; GFX9-NEXT: v_mov_b32_e32 v38, s26 +; GFX9-NEXT: v_mov_b32_e32 v37, s25 +; GFX9-NEXT: v_mov_b32_e32 v36, s24 +; GFX9-NEXT: v_mov_b32_e32 v35, s23 +; GFX9-NEXT: v_mov_b32_e32 v34, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v31, s19 +; GFX9-NEXT: v_mov_b32_e32 v30, s18 +; GFX9-NEXT: v_mov_b32_e32 v29, s17 +; GFX9-NEXT: v_mov_b32_e32 v28, s16 +; GFX9-NEXT: v_mov_b32_e32 v50, s43 +; GFX9-NEXT: v_mov_b32_e32 v51, s42 +; GFX9-NEXT: v_mov_b32_e32 v52, s41 +; GFX9-NEXT: v_mov_b32_e32 v53, s40 +; GFX9-NEXT: v_mov_b32_e32 v54, s15 +; GFX9-NEXT: v_mov_b32_e32 v55, s14 +; GFX9-NEXT: v_mov_b32_e32 v40, s13 +; GFX9-NEXT: v_mov_b32_e32 v41, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v43, s10 +; GFX9-NEXT: v_mov_b32_e32 v44, s9 +; GFX9-NEXT: v_mov_b32_e32 v45, s8 +; GFX9-NEXT: v_mov_b32_e32 v46, s7 +; GFX9-NEXT: v_mov_b32_e32 v47, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v28, v47, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v46, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v45, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v44, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v43, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v42, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v41, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v40, 16, v35 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v36, v55, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v54, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v53, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v52, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v51, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v50, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v12, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v10, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v12, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s13, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v71.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s12 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v38, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v50, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v39, 16, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.h +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v51, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v56i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v34, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v35, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s41 :: v_dual_mov_b32 v51, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s13 :: v_dual_mov_b32 v55, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s11 :: v_dual_mov_b32 v65, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s9 :: v_dual_mov_b32 v67, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <56 x i16> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <56 x i16> + br label %end + +end: + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 2837f2b2bd7fa..97e880e1bf488 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -1,54 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v30i32_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30i32_to_v30f32: ; VI: ; %bb.0: @@ -196,49 +196,354 @@ end: ret <30 x float> %phi } +define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v30i32_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v30i32_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v30i32_to_v30f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v30f32_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30f32_to_v30i32: ; VI: ; %bb.0: @@ -247,7 +552,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 @@ -279,7 +584,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -290,7 +595,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 @@ -322,7 +627,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -334,7 +639,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -351,7 +656,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -371,49 +676,339 @@ end: ret <30 x i32> %phi } +define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v30f32_to_v30i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v30i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v30f32_to_v30i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v30i32_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30i32_to_v15i64: ; VI: ; %bb.0: @@ -422,7 +1017,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 @@ -454,7 +1049,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -465,7 +1060,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 @@ -497,7 +1092,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -509,7 +1104,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 @@ -541,7 +1136,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -561,63 +1156,368 @@ end: ret <15 x i64> %phi } -define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v15i64_to_v30i32: +; VI-LABEL: bitcast_v30i32_to_v15i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v30i32_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v30i32_to_v15i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v30i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -644,7 +1544,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -655,7 +1555,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc @@ -687,7 +1587,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -699,7 +1599,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -739,7 +1639,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -759,282 +1659,450 @@ end: ret <30 x i32> %phi } -define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v30i32_to_v15f64: +; VI-LABEL: bitcast_v15i64_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v30i32_to_v15f64: +; GFX9-LABEL: bitcast_v15i64_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 ; -; GFX11-LABEL: bitcast_v30i32_to_v15f64: +; GFX11-LABEL: bitcast_v15i64_to_v30i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <30 x i32> %a, splat (i32 3) - %a2 = bitcast <30 x i32> %a1 to <15 x double> + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <30 x i32> %a to <15 x double> + %a3 = bitcast <15 x i64> %a to <30 x i32> br label %end end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi } -define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v30i32_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15f64_to_v30i32: +; VI-LABEL: bitcast_v30i32_to_v15f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15f64_to_v30i32: +; GFX9-LABEL: bitcast_v30i32_to_v15f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v15f64_to_v30i32: +; GFX11-LABEL: bitcast_v30i32_to_v15f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -1042,456 +2110,173 @@ define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <30 x i32> + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <15 x double> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <30 x i32> + %a3 = bitcast <30 x i32> %a to <15 x double> br label %end end: - %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x i32> %phi + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi } -define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v3, v3, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v5, v5, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v9, v9, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v11, v11, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v13, v13, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v14, v14, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v15, v15, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v17, v17, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v18, v18, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v20, v20, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v22, v22, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v28, v28, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v30i32_to_v60i16: +; VI-LABEL: bitcast_v30i32_to_v15f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 -; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 @@ -1505,7 +2290,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 @@ -1522,197 +2307,51 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB6_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 ; -; GFX9-LABEL: bitcast_v30i32_to_v60i16: +; GFX9-LABEL: bitcast_v30i32_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 @@ -1726,7 +2365,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 @@ -1743,989 +2382,842 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v30i32_to_v15f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v30i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v30i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: bitcast_v15f64_to_v30i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 ; -; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: bitcast_v15f64_to_v30i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v31, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: v_mov_b32_e32 v17, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v15f64_to_v30i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v15f64_to_v30i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <30 x i32> %a, splat (i32 3) - %a2 = bitcast <30 x i32> %a1 to <60 x i16> + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <30 x i32> %a to <60 x i16> + %a3 = bitcast <15 x double> %a to <30 x i32> br label %end end: - %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x i16> %phi + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi } -define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v30i32_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v60i16_to_v30i32: +; VI-LABEL: bitcast_v30i32_to_v60i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -2754,268 +3246,199 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v61 -; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v59 -; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v58 -; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v57 -; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v56 -; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v47 -; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v46 -; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v45 -; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v44 -; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v43 -; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v42 -; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v41 -; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v40 -; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v55 -; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v54 -; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v52 -; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v51 -; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v50 -; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v49 -; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v48 -; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v39 -; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v38 -; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v37 -; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v36 -; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v35 -; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: v_add_u16_e32 v27, 3, v34 -; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v33 -; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v28, v30 -; VI-NEXT: v_add_u16_e32 v30, 3, v32 -; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB7_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v60i16_to_v30i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30i32_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -3037,233 +3460,162 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32: +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -3271,2275 +3623,2044 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32: +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <30 x i32> + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <30 x i32> + %a3 = bitcast <30 x i32> %a to <60 x i16> br label %end end: - %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x i32> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v41 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v36, v33 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v62 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v49, v32 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v63 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: s_branch .LBB13_2 ; -; VI-LABEL: bitcast_v30i32_to_v60f16: +; VI-LABEL: bitcast_v30i32_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB8_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB13_2 ; -; GFX9-LABEL: bitcast_v30i32_to_v60f16: +; GFX9-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB13_2 ; -; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 ; -; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: %a1 = add <30 x i32> %a, splat (i32 3) - %a2 = bitcast <30 x i32> %a1 to <60 x half> + %a2 = bitcast <30 x i32> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <30 x i32> %a to <60 x half> + %a3 = bitcast <30 x i32> %a to <60 x i16> br label %end end: - %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x half> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v60i16_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v60f16_to_v30i32: +; VI-LABEL: bitcast_v60i16_to_v30i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -5590,7 +5711,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v29, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5683,102 +5804,102 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 -; VI-NEXT: v_or_b32_e32 v22, v23, v22 -; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 -; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 -; VI-NEXT: v_or_b32_e32 v24, v25, v24 -; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 -; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 -; VI-NEXT: v_or_b32_e32 v26, v27, v26 -; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v28, v30, v28 -; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_mov_b32_e32 v29, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v61 +; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v60 +; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v59 +; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v58 +; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v57 +; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v56 +; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v47 +; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v46 +; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v45 +; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v44 +; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v43 +; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v42 +; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v41 +; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v40 +; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v55 +; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v54 +; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v52 +; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v51 +; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v50 +; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v49 +; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v48 +; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v39 +; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v38 +; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v37 +; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v36 +; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v35 +; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: v_add_u16_e32 v27, 3, v34 +; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v33 +; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v30 +; VI-NEXT: v_add_u16_e32 v30, 3, v32 +; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5797,7 +5918,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v60f16_to_v30i32: +; GFX9-LABEL: bitcast_v60i16_to_v30i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -5901,7 +6022,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -6057,9 +6178,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload @@ -6079,7 +6200,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 @@ -6088,15 +6208,15 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6129,50 +6249,50 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6193,7 +6313,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -6201,43 +6321,43 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 @@ -6305,51 +6425,51 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <60 x half> %a, splat (half 0xH0200) - %a2 = bitcast <60 x half> %a1 to <30 x i32> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <60 x half> %a to <30 x i32> + %a3 = bitcast <60 x i16> %a to <30 x i32> br label %end end: @@ -6357,1029 +6477,4967 @@ end: ret <30 x i32> %phi } -define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v30f32_to_v15i64: +; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v39, v15 +; VI-NEXT: v_mov_b32_e32 v37, v14 +; VI-NEXT: v_mov_b32_e32 v35, v13 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v33, v11 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v55, v1 +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 ; -; GFX9-LABEL: bitcast_v30f32_to_v15i64: +; GFX9-LABEL: bitcast_v60i16_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v30f32_to_v15i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <30 x float> %a1 to <15 x i64> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <30 x float> %a to <15 x i64> + %a3 = bitcast <60 x i16> %a to <30 x i32> br label %end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi } -define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v30i32_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v63, v25 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15i64_to_v30f32: +; VI-LABEL: bitcast_v30i32_to_v60f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v15i64_to_v30f32: +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30i32_to_v60f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v15i64_to_v30f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <30 x float> - br label %end - -cmp.false: - %a3 = bitcast <15 x i64> %a to <30 x float> - br label %end - -end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi -} - -define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v30f32_to_v15f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v30f32_to_v15f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v30f32_to_v15f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <30 x float> %a1 to <15 x double> - br label %end - -cmp.false: - %a3 = bitcast <30 x float> %a to <15 x double> - br label %end - -end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi -} - -define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v15f64_to_v30f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15f64_to_v30f32: +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s18, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s20, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s24, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s26, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s28, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s45, 16 +; SI-NEXT: s_lshr_b32 s75, s44, 16 +; SI-NEXT: s_lshr_b32 s76, s43, 16 +; SI-NEXT: s_lshr_b32 s77, s42, 16 +; SI-NEXT: s_lshr_b32 s78, s41, 16 +; SI-NEXT: s_lshr_b32 s79, s40, 16 +; SI-NEXT: s_lshr_b32 s88, s15, 16 +; SI-NEXT: s_lshr_b32 s89, s14, 16 +; SI-NEXT: s_lshr_b32 s90, s13, 16 +; SI-NEXT: s_lshr_b32 s91, s12, 16 +; SI-NEXT: s_lshr_b32 s92, s11, 16 +; SI-NEXT: s_lshr_b32 s93, s10, 16 +; SI-NEXT: s_lshr_b32 s94, s8, 16 +; SI-NEXT: s_lshr_b32 s95, s7, 16 +; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v59, v59, v60 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v59, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v57, v57, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: buffer_store_dword v57, v59, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v57, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v43, v44, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v41, v42, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v36, v38, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v36, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v30i32_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v30i32_to_v60f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB17_2 ; -; GFX11-LABEL: bitcast_v15f64_to_v30f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <30 x float> + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <30 x float> + %a3 = bitcast <30 x i32> %a to <60 x half> br label %end end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v3, v3, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v5, v5, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v9, v9, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v11, v11, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v13, v13, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v14, v14, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v15, v15, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v17, v17, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v18, v18, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v20, v20, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v22, v22, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v28, v28, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v60f16_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v30f32_to_v60i16: +; VI-LABEL: bitcast_v60f16_to_v30i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -7408,244 +11466,4613 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 +; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v30f32_to_v60i16: +; GFX9-LABEL: bitcast_v60f16_to_v30i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB19_3 +; SI-NEXT: .LBB19_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB19_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB19_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB19_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v30i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v30i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v15i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v15i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v30f32_to_v15i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v30f32_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v30f32_to_v15i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v30f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15i64_to_v30f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15i64_to_v30f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v15i64_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v15i64_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v15i64_to_v30f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v15f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v15f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v30f32_to_v15f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v30f32_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v30f32_to_v15f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v30f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v30f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15f64_to_v30f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v15f64_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v31, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: v_mov_b32_e32 v17, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v15f64_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v15f64_to_v30f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v60i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB28_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB28_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 @@ -7665,9 +16092,9 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 @@ -7729,7 +16156,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 @@ -7785,7 +16212,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -7802,7 +16229,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7843,7 +16270,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 @@ -7875,9 +16302,9 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -7924,7 +16351,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 @@ -7975,566 +16402,1819 @@ end: ret <60 x i16> %phi } +define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v33, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v32, s19 +; SI-NEXT: v_mov_b32_e32 v29, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v48, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v48, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v30, v30, v56 +; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v30f32_to_v60i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v27, s21 +; VI-NEXT: v_mov_b32_e32 v26, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v60i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v28, s19 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v29, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s21 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <60 x i16> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <60 x i16> + br label %end + +end: + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi +} + define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60i16_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: ; VI: ; %bb.0: @@ -8587,7 +18267,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v29, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8680,9 +18360,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v29, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v61 @@ -8775,7 +18455,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v30, 3, v32 ; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8898,7 +18578,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -9054,9 +18734,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload @@ -9168,926 +18848,5135 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v39, v15 +; VI-NEXT: v_mov_b32_e32 v37, v14 +; VI-NEXT: v_mov_b32_e32 v35, v13 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v33, v11 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v55, v1 +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v63, v25 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v60f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB32_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v60f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB32_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, s12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v6, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_add_f32_e64 v40, s6, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v17, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_add_f32_e64 v19, s26, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_add_f32_e64 v48, s8, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 +; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_add_f32_e64 v18, s13, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 +; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v44, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v62 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v61 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v27, s21 +; VI-NEXT: v_mov_b32_e32 v26, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v60f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v28, s19 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v29, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s21 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB33_2 ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <30 x float> + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <30 x float> + %a3 = bitcast <30 x float> %a to <60 x half> br label %end end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v41 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v36, v33 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v62 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v49, v32 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v63 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v60f16_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v30f32_to_v60f16: +; VI-LABEL: bitcast_v60f16_to_v30f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -10116,199 +24005,268 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 +; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v30f32_to_v60f16: +; GFX9-LABEL: bitcast_v60f16_to_v30f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -10330,162 +24288,234 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -10493,1139 +24523,983 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <30 x float> %a1 to <60 x half> - br label %end - -cmp.false: - %a3 = bitcast <30 x float> %a to <60 x half> - br label %end - -end: - %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x half> %phi -} - -define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v60f16_to_v30f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB35_3 +; SI-NEXT: .LBB35_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB35_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB35_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB35_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -11674,422 +25548,2272 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v15f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v60f16_to_v30f32: +; GFX9-LABEL: bitcast_v15i64_to_v15f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15i64_to_v15f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v15i64_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v15i64_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v15i64_to_v15f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v15i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v15i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15f64_to_v15i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v15f64_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v31, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: v_mov_b32_e32 v17, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v15f64_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v15f64_to_v15i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v60i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB40_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB40_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15i64_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB40_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32: +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -12097,835 +27821,2175 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32: +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = fadd <60 x half> %a, splat (half 0xH0200) - %a2 = bitcast <60 x half> %a1 to <30 x float> - br label %end - -cmp.false: - %a3 = bitcast <60 x half> %a to <30 x float> - br label %end - -end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi -} - -define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v15i64_to_v15f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v15i64_to_v15f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v15i64_to_v15f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - cmp.true: %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <15 x double> + %a2 = bitcast <15 x i64> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <15 x i64> %a to <15 x double> + %a3 = bitcast <15 x i64> %a to <60 x i16> br label %end end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s45, s45, 3 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: s_branch .LBB41_2 ; -; VI-LABEL: bitcast_v15f64_to_v15i64: +; VI-LABEL: bitcast_v15i64_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: .LBB19_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s45, s45, 3 +; VI-NEXT: s_addc_u32 s44, s44, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB41_2 ; -; GFX9-LABEL: bitcast_v15f64_to_v15i64: +; GFX9-LABEL: bitcast_v15i64_to_v60i16_scalar: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v15f64_to_v15i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s44, s44, 3 +; GFX9-NEXT: s_addc_u32 s45, s45, 0 +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <15 x i64> + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <15 x i64> + %a3 = bitcast <15 x i64> %a to <60 x i16> br label %end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v3, v3, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v9, v9, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v13, v13, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v14, v14, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v18, v18, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v20, v20, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v28, v28, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v30, v30, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v60i16_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15i64_to_v60i16: +; VI-LABEL: bitcast_v60i16_to_v15i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -12954,199 +30018,268 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v61 +; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v60 +; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v59 +; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v58 +; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v57 +; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v56 +; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v47 +; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v46 +; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v45 +; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v44 +; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v43 +; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v42 +; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v41 +; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v40 +; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v55 +; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v54 +; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v52 +; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v51 +; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v50 +; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v49 +; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v48 +; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v39 +; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v38 +; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v37 +; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v36 +; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v35 +; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: v_add_u16_e32 v27, 3, v34 +; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v33 +; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v30 +; VI-NEXT: v_add_u16_e32 v30, 3, v32 +; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15i64_to_v60i16: +; GFX9-LABEL: bitcast_v60i16_to_v15i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -13168,162 +30301,233 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16: +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -13331,919 +30535,2163 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16: +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <60 x i16> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v39, v15 +; VI-NEXT: v_mov_b32_e32 v37, v14 +; VI-NEXT: v_mov_b32_e32 v35, v13 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v33, v11 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v55, v1 +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x i64> br label %end cmp.false: - %a3 = bitcast <15 x i64> %a to <60 x i16> + %a3 = bitcast <60 x i16> %a to <15 x i64> br label %end end: - %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x i16> %phi + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi } -define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v63, v25 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v60i16_to_v15i64: +; VI-LABEL: bitcast_v15i64_to_v60f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -14272,268 +32720,199 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v61 -; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v59 -; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v58 -; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v57 -; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v56 -; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v47 -; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v46 -; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v45 -; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v44 -; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v43 -; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v42 -; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v41 -; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v40 -; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v55 -; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v54 -; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v52 -; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v51 -; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v50 -; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v49 -; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v48 -; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v39 -; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v38 -; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v37 -; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v36 -; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v35 -; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: v_add_u16_e32 v27, 3, v34 -; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v33 -; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v28, v30 -; VI-NEXT: v_add_u16_e32 v30, 3, v32 -; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v60i16_to_v15i64: +; GFX9-LABEL: bitcast_v15i64_to_v60f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -14543,1143 +32922,2753 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB44_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s46, s18, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s56, s20, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s60, s24, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s62, s26, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s72, s28, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s74, s44, 16 +; SI-NEXT: s_lshr_b32 s75, s45, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s76, s42, 16 +; SI-NEXT: s_lshr_b32 s77, s43, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s78, s40, 16 +; SI-NEXT: s_lshr_b32 s79, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s88, s14, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s90, s12, 16 +; SI-NEXT: s_lshr_b32 s91, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s92, s10, 16 +; SI-NEXT: s_lshr_b32 s93, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s94, s7, 16 +; SI-NEXT: s_lshr_b32 s95, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v59, v59, v60 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v59, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v57, v57, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: buffer_store_dword v57, v59, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v57, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v43, v44, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v41, v42, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v36, v38, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v36, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v15i64_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s45, s45, 3 +; VI-NEXT: s_addc_u32 s44, s44, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v15i64_to_v60f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s44, s44, 3 +; GFX9-NEXT: s_addc_u32 s45, s45, 0 +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB45_2 ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64: +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64: +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <15 x i64> + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <15 x i64> + %a3 = bitcast <15 x i64> %a to <60 x half> br label %end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v41 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v36, v33 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v62 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v49, v32 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v63 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v60f16_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15i64_to_v60f16: +; VI-LABEL: bitcast_v60f16_to_v15i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -15708,199 +35697,268 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB22_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 +; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB46_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15i64_to_v60f16: +; GFX9-LABEL: bitcast_v60f16_to_v15i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -15922,162 +35980,234 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -16085,1185 +36215,983 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <60 x half> - br label %end - -cmp.false: - %a3 = bitcast <15 x i64> %a to <60 x half> - br label %end - -end: - %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x half> %phi -} - -define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v60f16_to_v15i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB47_3 +; SI-NEXT: .LBB47_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB47_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB47_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -17312,916 +37240,3416 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v60i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <60 x i16> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <60 x i16> + br label %end + +end: + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi +} + +define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v23, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v21, s24 +; SI-NEXT: v_mov_b32_e32 v22, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v37, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v39, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v37, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v39, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v56 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v15f64_to_v60i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB49_2 ; -; GFX9-LABEL: bitcast_v60f16_to_v15i64: +; GFX9-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v29, s18 +; GFX9-NEXT: v_mov_b32_e32 v30, s19 +; GFX9-NEXT: v_mov_b32_e32 v27, s20 +; GFX9-NEXT: v_mov_b32_e32 v28, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB23_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB49_2 ; -; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s0 :: v_dual_mov_b32 v30, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s2 :: v_dual_mov_b32 v28, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s18 :: v_dual_mov_b32 v24, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_and_b32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 ; -; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <60 x half> %a, splat (half 0xH0200) - %a2 = bitcast <60 x half> %a1 to <15 x i64> + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <60 x half> %a to <15 x i64> + %a3 = bitcast <15 x double> %a to <60 x i16> br label %end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v49, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v52, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v55, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v49, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v52, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v55, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v3, v3, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v5, v5, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v9, v9, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v10, v10, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v11, v11, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v13, v13, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v14, v14, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v15, v15, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v16, v16, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v20, v20, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v22, v22, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v24, v24, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x5c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v26, v26, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v28, v28, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x6c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v30, v30, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v60i16_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15f64_to_v60i16: +; VI-LABEL: bitcast_v60i16_to_v15f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -18250,184 +40678,268 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v61 +; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v60 +; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v59 +; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v58 +; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v57 +; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v56 +; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v47 +; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v46 +; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v45 +; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v44 +; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v43 +; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v42 +; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v41 +; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v40 +; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v55 +; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v54 +; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v52 +; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v51 +; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v50 +; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v49 +; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v48 +; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v39 +; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v38 +; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v37 +; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v36 +; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v35 +; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: v_add_u16_e32 v27, 3, v34 +; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v33 +; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v30 +; VI-NEXT: v_add_u16_e32 v30, 3, v32 +; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15f64_to_v60i16: +; GFX9-LABEL: bitcast_v60i16_to_v15f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -18445,151 +40957,237 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -18597,873 +41195,2138 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <60 x i16> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x double> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <60 x i16> + %a3 = bitcast <60 x i16> %a to <15 x double> br label %end end: - %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x i16> %phi + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi } -define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB51_2 ; -; VI-LABEL: bitcast_v60i16_to_v15f64: +; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s43, s29, 16 +; VI-NEXT: s_lshr_b32 s42, s28, 16 +; VI-NEXT: s_lshr_b32 s41, s27, 16 +; VI-NEXT: s_lshr_b32 s40, s26, 16 +; VI-NEXT: s_lshr_b32 s15, s25, 16 +; VI-NEXT: s_lshr_b32 s14, s24, 16 +; VI-NEXT: s_lshr_b32 s13, s23, 16 +; VI-NEXT: s_lshr_b32 s12, s22, 16 +; VI-NEXT: s_lshr_b32 s11, s21, 16 +; VI-NEXT: s_lshr_b32 s10, s20, 16 +; VI-NEXT: s_lshr_b32 s9, s19, 16 +; VI-NEXT: s_lshr_b32 s8, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s17, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v39, v15 +; VI-NEXT: v_mov_b32_e32 v37, v14 +; VI-NEXT: v_mov_b32_e32 v35, v13 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v33, v11 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v38, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v55, v1 +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s7, 16 +; VI-NEXT: v_or_b32_sdwa v14, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s8, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s10, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s11, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s12, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s13, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s14, 16 +; VI-NEXT: v_or_b32_sdwa v22, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s15, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s40, 16 +; VI-NEXT: v_or_b32_sdwa v24, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s41, 16 +; VI-NEXT: v_or_b32_sdwa v25, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s42, 16 +; VI-NEXT: v_or_b32_sdwa v26, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v48 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xffff +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v38 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s16 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xffff +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v36 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s9, s9, s16 +; VI-NEXT: s_and_b32 s16, s20, 0xffff +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s10, s10, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v32 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s11, s11, s16 +; VI-NEXT: s_and_b32 s16, s22, 0xffff +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v33 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s12, s12, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v34 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s13, s13, s16 +; VI-NEXT: s_and_b32 s16, s24, 0xffff +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v35 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_lshl_b32 s5, s42, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xffff +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v37 +; VI-NEXT: s_lshl_b32 s4, s43, 16 +; VI-NEXT: s_lshl_b32 s41, s41, 16 +; VI-NEXT: s_lshl_b32 s40, s40, 16 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s16, s26, 0xffff +; VI-NEXT: s_and_b32 s17, s27, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: s_or_b32 s16, s40, s16 +; VI-NEXT: s_or_b32 s17, s41, s17 +; VI-NEXT: s_or_b32 s4, s4, s18 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v8, s14 +; VI-NEXT: v_mov_b32_e32 v9, s15 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_mov_b32_e32 v11, s17 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v13, s4 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v60 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v59 +; SI-NEXT: v_mov_b32_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 +; SI-NEXT: v_mov_b32_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v63 +; SI-NEXT: v_mov_b32_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; SI-NEXT: v_add_f64 v[49:50], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v60f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -19492,268 +43355,184 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v61 -; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v59 -; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v58 -; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v57 -; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v56 -; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v47 -; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v46 -; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v45 -; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v44 -; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v43 -; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v42 -; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v41 -; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v40 -; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v55 -; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v54 -; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v52 -; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v51 -; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v50 -; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v49 -; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v48 -; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v39 -; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v38 -; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v37 -; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v36 -; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v35 -; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: v_add_u16_e32 v27, 3, v34 -; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v33 -; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v28, v30 -; VI-NEXT: v_add_u16_e32 v30, 3, v32 -; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v60i16_to_v15f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v60f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -19775,233 +43554,147 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB25_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64: +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -20009,845 +43702,773 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64: +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <15 x double> + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <15 x double> + %a3 = bitcast <15 x double> %a to <60 x half> br label %end end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v44 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v43 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v42 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v41 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v53 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v37, v33 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v38, v32 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v39, v36 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v49, v31 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v63 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v58 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s42, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_readfirstlane_b32 s40, v3 +; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_and_b64 s[44:45], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s44, s5, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 +; SI-NEXT: s_lshr_b32 s44, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 +; SI-NEXT: s_lshr_b32 s44, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: s_lshr_b32 s44, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: s_lshr_b32 s44, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: s_lshr_b32 s44, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: s_lshr_b32 s44, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s44 +; SI-NEXT: s_lshr_b32 s44, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s44 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s44 +; SI-NEXT: s_lshr_b32 s44, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s44 +; SI-NEXT: s_lshr_b32 s44, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s44 +; SI-NEXT: s_lshr_b32 s44, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s44 +; SI-NEXT: s_lshr_b32 s44, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s44 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s44 +; SI-NEXT: s_lshr_b32 s44, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s44 +; SI-NEXT: s_lshr_b32 s44, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s44 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s44 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s44 +; SI-NEXT: s_lshr_b32 s44, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s44 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s44 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s44 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s44 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 +; SI-NEXT: s_lshr_b32 s44, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s10 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, s13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_add_f64 v[57:58], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_add_f64 v[41:42], s[20:21], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_add_f64 v[53:54], s[22:23], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 +; SI-NEXT: v_add_f64 v[49:50], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v56 +; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_mov_b32_e32 v59, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v6 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v42 +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_branch .LBB53_2 ; -; VI-LABEL: bitcast_v15f64_to_v60f16: +; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -20860,145 +44481,141 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB26_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21011,49 +44628,81 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB53_2 ; -; GFX9-LABEL: bitcast_v15f64_to_v60f16: +; GFX9-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v29, s18 +; GFX9-NEXT: v_mov_b32_e32 v30, s19 +; GFX9-NEXT: v_mov_b32_e32 v27, s20 +; GFX9-NEXT: v_mov_b32_e32 v28, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -21066,81 +44715,40 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -21149,51 +44757,100 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21206,62 +44863,438 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB53_2 ; -; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s0 :: v_dual_mov_b32 v30, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s2 :: v_dual_mov_b32 v28, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s18 :: v_dual_mov_b32 v24, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_and_b32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 ; -; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16: +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 @@ -21278,136 +45311,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -21426,756 +45330,757 @@ end: } define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60f16_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60f16_to_v15f64: ; VI: ; %bb.0: @@ -22228,7 +46133,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v29, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -22321,9 +46226,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v29, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -22416,7 +46321,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -22539,7 +46444,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -22695,9 +46600,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload @@ -22810,7 +46715,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -22839,7 +46744,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -22871,7 +46776,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22943,7 +46848,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -22975,9 +46880,1484 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB55_3 +; SI-NEXT: .LBB55_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB55_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB55_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -22996,974 +48376,996 @@ end: } define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60i16_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v60f16: ; VI: ; %bb.0: @@ -24014,7 +49416,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v30, 3, v30 @@ -24076,7 +49478,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_add_u16_e32 v31, 3, v31 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_sdwa v0, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24202,7 +49604,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v29, v59, v29, s6 @@ -24295,7 +49697,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v29 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v18, v40, v18, s4 @@ -24351,7 +49753,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] @@ -24383,7 +49785,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24425,7 +49827,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 @@ -24482,73 +49884,2020 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 +; SI-NEXT: s_branch .LBB57_3 +; SI-NEXT: .LBB57_2: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: .LBB57_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v52, v54 +; SI-NEXT: v_mov_b32_e32 v54, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v46, v56 +; SI-NEXT: v_mov_b32_e32 v56, v31 +; SI-NEXT: s_cbranch_vccnz .LBB57_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60i16_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_or_b32_sdwa v30, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v60f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v51, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v50, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v49, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v48, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v39, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v38, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v37, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v36, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v35, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v34, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v15, v29, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v14, v28, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v55, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v54, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v30, s29 +; GFX9-NEXT: v_mov_b32_e32 v31, s28 +; GFX9-NEXT: v_mov_b32_e32 v51, s27 +; GFX9-NEXT: v_mov_b32_e32 v50, s26 +; GFX9-NEXT: v_mov_b32_e32 v49, s25 +; GFX9-NEXT: v_mov_b32_e32 v48, s24 +; GFX9-NEXT: v_mov_b32_e32 v39, s23 +; GFX9-NEXT: v_mov_b32_e32 v38, s22 +; GFX9-NEXT: v_mov_b32_e32 v37, s21 +; GFX9-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s18 +; GFX9-NEXT: v_mov_b32_e32 v33, s17 +; GFX9-NEXT: v_mov_b32_e32 v32, s16 +; GFX9-NEXT: v_mov_b32_e32 v53, s43 +; GFX9-NEXT: v_mov_b32_e32 v52, s42 +; GFX9-NEXT: v_mov_b32_e32 v40, s41 +; GFX9-NEXT: v_mov_b32_e32 v41, s40 +; GFX9-NEXT: v_mov_b32_e32 v42, s15 +; GFX9-NEXT: v_mov_b32_e32 v43, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v45, s12 +; GFX9-NEXT: v_mov_b32_e32 v46, s11 +; GFX9-NEXT: v_mov_b32_e32 v47, s10 +; GFX9-NEXT: v_mov_b32_e32 v56, s9 +; GFX9-NEXT: v_mov_b32_e32 v57, s8 +; GFX9-NEXT: v_mov_b32_e32 v58, s7 +; GFX9-NEXT: v_mov_b32_e32 v59, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v32, v59, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v58, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v57, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v56, 16, v35 +; GFX9-NEXT: v_lshl_or_b32 v36, v47, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v46, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v45, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v44, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v43, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v42, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v50, v41, 16, v50 +; GFX9-NEXT: v_lshl_or_b32 v51, v40, 16, v51 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v12, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v12, 16, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s15, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v83.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s13 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v39, 16, v64 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v35, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v33, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v54, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v55, 16, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v28, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_mov_b32 v8, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, v35 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v60f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v34, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v35, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v49, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v48, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s43 :: v_dual_mov_b32 v53, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v84, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -24568,611 +51917,633 @@ end: } define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v22 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v28 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v30 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v51 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v61 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v61, v27 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_mov_b32_e32 v45, v1 -; GCN-NEXT: v_mov_b32_e32 v56, v8 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GCN-NEXT: v_or_b32_e32 v48, v48, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v54 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_or_b32_e32 v54, v50, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v50 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v35, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v38 -; GCN-NEXT: v_or_b32_e32 v37, v37, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v1, v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v43, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v45, v45, v3 -; GCN-NEXT: v_or_b32_e32 v43, v46, v2 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v57, v56 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v63, v59, v58 -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: v_or_b32_e32 v61, v60, v25 -; GCN-NEXT: v_or_b32_e32 v55, v55, v47 -; GCN-NEXT: v_or_b32_e32 v51, v51, v44 -; GCN-NEXT: v_or_b32_e32 v30, v30, v41 -; GCN-NEXT: v_or_b32_e32 v28, v28, v53 -; GCN-NEXT: v_or_b32_e32 v42, v42, v24 -; GCN-NEXT: v_or_b32_e32 v49, v49, v29 -; GCN-NEXT: v_or_b32_e32 v39, v39, v27 -; GCN-NEXT: v_alignbit_b32 v60, v37, v50, 16 -; GCN-NEXT: v_alignbit_b32 v59, v35, v40, 16 -; GCN-NEXT: v_alignbit_b32 v3, v33, v3, 16 -; GCN-NEXT: v_alignbit_b32 v1, v31, v2, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v56, v15, v56, 16 -; GCN-NEXT: v_alignbit_b32 v2, v11, v58, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v26, v7, v26, 16 -; GCN-NEXT: v_alignbit_b32 v25, v21, v25, 16 -; GCN-NEXT: v_alignbit_b32 v47, v19, v47, 16 -; GCN-NEXT: v_alignbit_b32 v44, v17, v44, 16 -; GCN-NEXT: v_alignbit_b32 v41, v13, v41, 16 -; GCN-NEXT: v_alignbit_b32 v53, v9, v53, 16 -; GCN-NEXT: v_alignbit_b32 v24, v5, v24, 16 -; GCN-NEXT: v_alignbit_b32 v29, v54, v29, 16 -; GCN-NEXT: v_alignbit_b32 v62, v48, v27, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v60 -; GCN-NEXT: v_or_b32_e32 v57, v1, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; GCN-NEXT: v_or_b32_e32 v37, v1, v37 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v59 -; GCN-NEXT: v_or_b32_e32 v46, v1, v50 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v1, v35 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v43, v1, v3 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v56 -; GCN-NEXT: v_or_b32_e32 v34, v34, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; GCN-NEXT: v_or_b32_e32 v40, v40, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v56, v25 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v55, v55, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v51, v51, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v30, v30, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v42, v24 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v49, v29 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v62 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v48, v48, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v57, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60f16_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v35 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_mov_b32_e32 v47, v17 +; SI-NEXT: v_mov_b32_e32 v56, v4 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v59, v6 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v31, v31, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_or_b32_e32 v63, v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v37, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v48, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v52, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v55, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v4, v35, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v29, v29, v27 +; SI-NEXT: v_or_b32_e32 v39, v39, v30 +; SI-NEXT: v_or_b32_e32 v33, v33, v32 +; SI-NEXT: v_or_b32_e32 v50, v50, v28 +; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 +; SI-NEXT: v_alignbit_b32 v27, v22, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v32, 16 +; SI-NEXT: v_alignbit_b32 v28, v8, v28, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v4, v35, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_or_b32_e32 v59, v35, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_alignbit_b32 v10, v48, v10, 16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v58, v45, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v47 +; SI-NEXT: v_alignbit_b32 v13, v37, v13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v56, v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v46 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v46, v45, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v47, v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v43 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v61, v43, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v54 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_or_b32_e32 v54, v51, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v41 +; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v51 +; SI-NEXT: v_or_b32_e32 v36, v36, v41 +; SI-NEXT: v_alignbit_b32 v51, v25, v35, 16 +; SI-NEXT: v_alignbit_b32 v41, v2, v41, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_or_b32_e32 v4, v44, v43 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v14, v17, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v7, v21, 16 +; SI-NEXT: v_alignbit_b32 v44, v18, v45, 16 +; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v34, v34, v35 +; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_or_b32_e32 v34, v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60f16_to_v60i16: ; VI: ; %bb.0: @@ -25223,7 +52594,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 @@ -25285,7 +52656,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 ; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_sdwa v0, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25411,7 +52782,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v29, v59, v29, s6 @@ -25505,7 +52876,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v29 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v18, v40, v18, s4 @@ -25561,7 +52932,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] @@ -25593,7 +52964,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25635,7 +53006,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 @@ -25727,7 +53098,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 @@ -25776,3 +53147,1659 @@ end: %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <60 x i16> %phi } + +define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v45 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v61 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB59_3 +; SI-NEXT: .LBB59_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v14 +; SI-NEXT: v_mov_b32_e32 v63, v15 +; SI-NEXT: v_mov_b32_e32 v15, v18 +; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: v_mov_b32_e32 v22, v33 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v5, v42 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: s_cbranch_vccnz .LBB59_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_or_b32_e32 v62, v1, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v35 +; SI-NEXT: v_or_b32_e32 v34, v34, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_or_b32_e32 v37, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v48, v39, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_or_b32_e32 v51, v39, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_or_b32_e32 v44, v39, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v47 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v47, v50, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_or_b32_e32 v46, v50, v53 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v14, v58, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v63, v58, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v43, v43, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v40 +; SI-NEXT: v_or_b32_e32 v33, v33, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_or_b32_e32 v15, v15, v24 +; SI-NEXT: v_or_b32_e32 v61, v58, v19 +; SI-NEXT: v_or_b32_e32 v1, v36, v14 +; SI-NEXT: v_alignbit_b32 v60, v44, v39, 16 +; SI-NEXT: v_alignbit_b32 v59, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v58, v26, v50, 16 +; SI-NEXT: v_alignbit_b32 v57, v51, v57, 16 +; SI-NEXT: v_alignbit_b32 v56, v48, v56, 16 +; SI-NEXT: v_alignbit_b32 v42, v37, v42, 16 +; SI-NEXT: v_alignbit_b32 v41, v34, v41, 16 +; SI-NEXT: v_alignbit_b32 v40, v31, v40, 16 +; SI-NEXT: v_alignbit_b32 v55, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v54, v16, v54, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v9, v24, 16 +; SI-NEXT: v_alignbit_b32 v23, v6, v23, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16 +; SI-NEXT: v_alignbit_b32 v36, v62, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v60 +; SI-NEXT: v_or_b32_e32 v39, v39, v50 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v39, v39, v50 +; SI-NEXT: v_add_i32_e32 v50, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v59 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v39, v39, v50 +; SI-NEXT: v_add_i32_e32 v50, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v56 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v42 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v60i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v54, 0x200 +; VI-NEXT: v_add_f16_e32 v32, s16, v54 +; VI-NEXT: v_add_f16_e32 v59, s43, v54 +; VI-NEXT: v_add_f16_e32 v33, s17, v54 +; VI-NEXT: v_add_f16_e32 v58, s42, v54 +; VI-NEXT: v_add_f16_e32 v34, s18, v54 +; VI-NEXT: v_add_f16_e32 v57, s41, v54 +; VI-NEXT: v_add_f16_e32 v35, s19, v54 +; VI-NEXT: v_add_f16_e32 v56, s40, v54 +; VI-NEXT: v_add_f16_e32 v36, s20, v54 +; VI-NEXT: v_add_f16_e32 v47, s15, v54 +; VI-NEXT: v_add_f16_e32 v37, s21, v54 +; VI-NEXT: v_add_f16_e32 v46, s14, v54 +; VI-NEXT: v_add_f16_e32 v38, s22, v54 +; VI-NEXT: v_add_f16_e32 v45, s13, v54 +; VI-NEXT: v_add_f16_e32 v39, s23, v54 +; VI-NEXT: v_add_f16_e32 v44, s12, v54 +; VI-NEXT: v_add_f16_e32 v48, s24, v54 +; VI-NEXT: v_add_f16_e32 v43, s11, v54 +; VI-NEXT: v_add_f16_e32 v49, s25, v54 +; VI-NEXT: v_add_f16_e32 v42, s10, v54 +; VI-NEXT: v_add_f16_e32 v50, s26, v54 +; VI-NEXT: v_add_f16_e32 v41, s9, v54 +; VI-NEXT: v_add_f16_e32 v51, s27, v54 +; VI-NEXT: v_add_f16_e32 v40, s8, v54 +; VI-NEXT: v_add_f16_e32 v52, s28, v54 +; VI-NEXT: v_add_f16_e32 v55, s7, v54 +; VI-NEXT: v_add_f16_e32 v53, s29, v54 +; VI-NEXT: v_add_f16_e32 v54, s6, v54 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v54, s6 +; VI-NEXT: v_mov_b32_e32 v53, s29 +; VI-NEXT: v_mov_b32_e32 v55, s7 +; VI-NEXT: v_mov_b32_e32 v52, s28 +; VI-NEXT: v_mov_b32_e32 v40, s8 +; VI-NEXT: v_mov_b32_e32 v51, s27 +; VI-NEXT: v_mov_b32_e32 v41, s9 +; VI-NEXT: v_mov_b32_e32 v50, s26 +; VI-NEXT: v_mov_b32_e32 v42, s10 +; VI-NEXT: v_mov_b32_e32 v49, s25 +; VI-NEXT: v_mov_b32_e32 v43, s11 +; VI-NEXT: v_mov_b32_e32 v48, s24 +; VI-NEXT: v_mov_b32_e32 v44, s12 +; VI-NEXT: v_mov_b32_e32 v39, s23 +; VI-NEXT: v_mov_b32_e32 v45, s13 +; VI-NEXT: v_mov_b32_e32 v38, s22 +; VI-NEXT: v_mov_b32_e32 v46, s14 +; VI-NEXT: v_mov_b32_e32 v37, s21 +; VI-NEXT: v_mov_b32_e32 v47, s15 +; VI-NEXT: v_mov_b32_e32 v36, s20 +; VI-NEXT: v_mov_b32_e32 v56, s40 +; VI-NEXT: v_mov_b32_e32 v35, s19 +; VI-NEXT: v_mov_b32_e32 v57, s41 +; VI-NEXT: v_mov_b32_e32 v34, s18 +; VI-NEXT: v_mov_b32_e32 v58, s42 +; VI-NEXT: v_mov_b32_e32 v33, s17 +; VI-NEXT: v_mov_b32_e32 v59, s43 +; VI-NEXT: v_mov_b32_e32 v32, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v32, v32, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v36, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v38, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v39, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v48, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v49, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v50, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v51, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v52, v52, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v53, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v60f16_to_v60i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v29, 16, v15 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v28, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v55, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v54, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x200 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v51, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v50, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v49, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v48, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v39, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v38, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v37, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v36, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v35, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v30, s29 +; GFX9-NEXT: v_mov_b32_e32 v31, s28 +; GFX9-NEXT: v_mov_b32_e32 v51, s27 +; GFX9-NEXT: v_mov_b32_e32 v50, s26 +; GFX9-NEXT: v_mov_b32_e32 v49, s25 +; GFX9-NEXT: v_mov_b32_e32 v48, s24 +; GFX9-NEXT: v_mov_b32_e32 v39, s23 +; GFX9-NEXT: v_mov_b32_e32 v38, s22 +; GFX9-NEXT: v_mov_b32_e32 v37, s21 +; GFX9-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s18 +; GFX9-NEXT: v_mov_b32_e32 v33, s17 +; GFX9-NEXT: v_mov_b32_e32 v32, s16 +; GFX9-NEXT: v_mov_b32_e32 v53, s43 +; GFX9-NEXT: v_mov_b32_e32 v52, s42 +; GFX9-NEXT: v_mov_b32_e32 v40, s41 +; GFX9-NEXT: v_mov_b32_e32 v41, s40 +; GFX9-NEXT: v_mov_b32_e32 v42, s15 +; GFX9-NEXT: v_mov_b32_e32 v43, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v45, s12 +; GFX9-NEXT: v_mov_b32_e32 v46, s11 +; GFX9-NEXT: v_mov_b32_e32 v47, s10 +; GFX9-NEXT: v_mov_b32_e32 v56, s9 +; GFX9-NEXT: v_mov_b32_e32 v57, s8 +; GFX9-NEXT: v_mov_b32_e32 v58, s7 +; GFX9-NEXT: v_mov_b32_e32 v59, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v32, v59, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v58, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v57, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v56, 16, v35 +; GFX9-NEXT: v_lshl_or_b32 v36, v47, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v46, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v45, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v44, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v43, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v42, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v50, v41, 16, v50 +; GFX9-NEXT: v_lshl_or_b32 v51, v40, 16, v51 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v12, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v12, 16, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s15, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v83.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s13 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v39, 16, v64 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v35, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v33, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v54, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v55, 16, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v28, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_mov_b32 v8, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, v35 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v60i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v34, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v35, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v49, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v48, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s43 :: v_dual_mov_b32 v53, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v84, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <60 x i16> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <60 x i16> + br label %end + +end: + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 4ae7c88e7eb45..8d945ea75e761 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1,27 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v3f32: ; VI: ; %bb.0: @@ -85,22 +84,116 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v3i32_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v3i32: ; VI: ; %bb.0: @@ -163,58 +256,156 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v12i8: ; VI: ; %bb.0: @@ -234,7 +425,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -245,9 +436,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB2_2: ; %Flow +; VI-NEXT: .LBB4_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_4 +; VI-NEXT: s_cbranch_execz .LBB4_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -261,7 +452,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB2_4: ; %end +; VI-NEXT: .LBB4_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 @@ -285,7 +476,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -296,9 +487,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB2_2: ; %Flow +; GFX9-NEXT: .LBB4_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cbranch_execz .LBB4_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -312,7 +503,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB2_4: ; %end +; GFX9-NEXT: .LBB4_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -342,7 +533,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 @@ -355,7 +546,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 -; GFX11-TRUE16-NEXT: .LBB2_4: ; %end +; GFX11-TRUE16-NEXT: .LBB4_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h @@ -384,7 +575,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -395,9 +586,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB2_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB4_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -412,7 +603,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB2_4: ; %end +; GFX11-FAKE16-NEXT: .LBB4_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 @@ -434,103 +625,391 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v3i32_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s15 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 +; GFX9-NEXT: s_lshr_b32 s15, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 +; GFX9-NEXT: s_lshr_b32 s15, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-TRUE16-LABEL: bitcast_v3i32_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: .LBB5_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB5_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB5_2 +; +; GFX11-FAKE16-LABEL: bitcast_v3i32_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-FAKE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: .LBB5_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v3, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB5_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v6, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v15, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v3i32: ; VI: ; %bb.0: @@ -547,14 +1026,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: s_cbranch_execnz .LBB6_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB3_4 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB6_4 +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB3_3: ; %cmp.false +; VI-NEXT: .LBB6_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -577,8 +1056,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: .LBB3_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: .LBB6_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -619,14 +1098,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: s_cbranch_execnz .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB3_3: ; %cmp.false +; GFX9-NEXT: .LBB6_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -649,8 +1128,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: .LBB3_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: .LBB6_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -693,14 +1172,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB3_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB6_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB3_4 -; GFX11-TRUE16-NEXT: .LBB3_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB6_4 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB3_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h @@ -737,8 +1216,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB3_2 -; GFX11-TRUE16-NEXT: .LBB3_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 @@ -792,14 +1271,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB3_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB6_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB3_4 -; GFX11-FAKE16-NEXT: .LBB3_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB6_4 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB3_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -835,8 +1314,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB3_2 -; GFX11-FAKE16-NEXT: .LBB3_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -890,53 +1369,403 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v12i8_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s8, s17, 8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s27, 24 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6bf16: ; VI: ; %bb.0: @@ -1000,63 +1829,179 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v3, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v3i32: ; VI: ; %bb.0: @@ -1065,7 +2010,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -1122,7 +2067,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1133,7 +2078,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -1182,7 +2127,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s7 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1194,7 +2139,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1252,7 +2197,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: .LBB5_2: ; %end +; GFX11-TRUE16-NEXT: .LBB10_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1264,7 +2209,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1317,7 +2262,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB5_2: ; %end +; GFX11-FAKE16-NEXT: .LBB10_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1337,59 +2282,352 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6f16: ; VI: ; %bb.0: @@ -1453,78 +2691,194 @@ end: ret <6 x half> %phi } -define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: v_or_b32_e32 v1, v6, v1 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v4, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB13_2 ; -; VI-LABEL: bitcast_v6f16_to_v3i32: +; VI-LABEL: bitcast_v3i32_to_v6f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + +define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v6f16_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1536,7 +2890,7 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1588,40 +2942,182 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v6f16_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_4 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_3: +; VI-NEXT: s_branch .LBB15_2 +; VI-NEXT: .LBB15_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6i16: ; VI: ; %bb.0: @@ -1685,56 +3181,165 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v3i32_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v3i32: ; VI: ; %bb.0: @@ -1743,7 +3348,7 @@ define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_e32 v3, 3, v2 @@ -1755,7 +3360,7 @@ define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v3, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1806,58 +3411,188 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v6i16_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v12i8: ; VI: ; %bb.0: @@ -1877,7 +3612,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -1888,9 +3623,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB10_2: ; %Flow +; VI-NEXT: .LBB20_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_4 +; VI-NEXT: s_cbranch_execz .LBB20_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -1904,7 +3639,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB10_4: ; %end +; VI-NEXT: .LBB20_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 @@ -1928,7 +3663,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -1939,9 +3674,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB10_2: ; %Flow +; GFX9-NEXT: .LBB20_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: s_cbranch_execz .LBB20_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -1955,7 +3690,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB10_4: ; %end +; GFX9-NEXT: .LBB20_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -1985,7 +3720,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1997,7 +3732,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 -; GFX11-TRUE16-NEXT: .LBB10_4: ; %end +; GFX11-TRUE16-NEXT: .LBB20_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h @@ -2026,7 +3761,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -2037,9 +3772,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB10_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 1.0, v8 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13 @@ -2053,7 +3788,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB10_4: ; %end +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 @@ -2075,103 +3810,411 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v14, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s16, 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: s_branch .LBB21_5 +; VI-NEXT: .LBB21_3: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB21_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v14, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: s_branch .LBB21_5 +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB21_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v3f32_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-TRUE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB21_5 +; GFX11-TRUE16-NEXT: .LBB21_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB21_2 +; GFX11-TRUE16-NEXT: .LBB21_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: .LBB21_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v3f32_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-FAKE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB21_5 +; GFX11-FAKE16-NEXT: .LBB21_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB21_2 +; GFX11-FAKE16-NEXT: .LBB21_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB21_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v6, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v15, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v3f32: ; VI: ; %bb.0: @@ -2188,14 +4231,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: s_cbranch_execnz .LBB22_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_4 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB22_4 +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB11_3: ; %cmp.false +; VI-NEXT: .LBB22_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2218,8 +4261,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: .LBB11_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: .LBB22_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2260,14 +4303,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: s_cbranch_execnz .LBB22_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB22_4 +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB11_3: ; %cmp.false +; GFX9-NEXT: .LBB22_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2290,8 +4333,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: .LBB11_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: .LBB22_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2334,14 +4377,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h @@ -2378,8 +4421,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-TRUE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 @@ -2433,14 +4476,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -2476,8 +4519,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-FAKE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -2531,67 +4574,417 @@ end: ret <3 x float> %phi } -define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_4 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB12_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: .LBB12_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v3f32_to_v6bf16: +; VI-LABEL: bitcast_v12i8_to_v3f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s8, s17, 8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s27, 24 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v3f32_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_4 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v3f32_to_v6bf16: @@ -2640,63 +5033,184 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v3, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v3f32: ; VI: ; %bb.0: @@ -2705,7 +5219,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -2762,7 +5276,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2773,7 +5287,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -2822,7 +5336,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s7 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2834,7 +5348,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2892,7 +5406,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2904,7 +5418,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2957,7 +5471,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2977,59 +5491,352 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_4 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_3: +; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_4 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB14_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: .LBB14_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_4 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB28_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: .LBB28_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16: ; VI: ; %bb.0: @@ -3092,69 +5899,189 @@ end: ret <6 x half> %phi } +define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v3f32_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: v_or_b32_e32 v1, v6, v1 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v4, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v3f32: ; VI: ; %bb.0: @@ -3163,7 +6090,7 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3175,7 +6102,7 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3227,40 +6154,182 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v6f16_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_4 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_3: +; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: .LBB31_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6i16: ; VI: ; %bb.0: @@ -3323,56 +6392,168 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v3f32: ; VI: ; %bb.0: @@ -3381,7 +6562,7 @@ define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_e32 v3, 3, v2 @@ -3393,7 +6574,7 @@ define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v3, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3444,111 +6625,241 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v6i16_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GCN-NEXT: v_or_b32_e32 v11, v1, v0 -; GCN-NEXT: v_or_b32_e32 v13, v14, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v15, v4 -; GCN-NEXT: v_or_b32_e32 v12, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v16, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v10 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GCN-NEXT: v_or_b32_e32 v4, v17, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v3, v16, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v15, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v11 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v4, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: v_or_b32_e32 v11, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v5, v16, v2 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB36_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6bf16: ; VI: ; %bb.0: @@ -3565,14 +6876,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB18_3 +; VI-NEXT: s_cbranch_execnz .LBB36_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB18_4 -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB36_4 +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB18_3: ; %cmp.false +; VI-NEXT: .LBB36_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3595,8 +6906,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: .LBB18_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: .LBB36_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3637,14 +6948,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB18_3 +; GFX9-NEXT: s_cbranch_execnz .LBB36_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB18_4 -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB36_4 +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB18_3: ; %cmp.false +; GFX9-NEXT: .LBB36_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3667,8 +6978,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: .LBB18_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: .LBB36_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3713,14 +7024,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-TRUE16-NEXT: .LBB18_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_4 +; GFX11-TRUE16-NEXT: .LBB36_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB18_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h @@ -3758,8 +7069,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-TRUE16-NEXT: .LBB18_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 @@ -3813,14 +7124,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-FAKE16-NEXT: .LBB18_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_4 +; GFX11-FAKE16-NEXT: .LBB36_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB18_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -3856,8 +7167,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-FAKE16-NEXT: .LBB18_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-FAKE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -3911,88 +7222,449 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s5, 16 +; SI-NEXT: s_and_b32 s11, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s4, 16 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s8, s17, 8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s27, 24 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s7, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB37_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GCN-NEXT: v_alignbit_b32 v0, v0, v15, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v14, 16 -; GCN-NEXT: v_alignbit_b32 v8, v10, v12, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v2, 16 -; GCN-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v12i8: ; VI: ; %bb.0: @@ -4012,7 +7684,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13 @@ -4023,9 +7695,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -4092,7 +7764,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: .LBB38_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v15 ; VI-NEXT: v_mov_b32_e32 v4, v16 @@ -4117,7 +7789,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -4128,9 +7800,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: .LBB38_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cbranch_execz .LBB38_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -4192,7 +7864,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: .LBB38_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -4216,7 +7888,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v3 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 @@ -4228,9 +7900,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l -; GFX11-TRUE16-NEXT: .LBB19_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v13 @@ -4295,7 +7967,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; GFX11-TRUE16-NEXT: .LBB19_4: ; %end +; GFX11-TRUE16-NEXT: .LBB38_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h @@ -4325,7 +7997,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -4336,9 +8008,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB19_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB38_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v8 @@ -4384,30 +8056,588 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v6, v7, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v4, v5, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: .LBB38_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + +define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; VI-NEXT: s_branch .LBB39_5 +; VI-NEXT: .LBB39_3: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_mov_b32_e32 v15, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB39_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v14 +; VI-NEXT: v_mov_b32_e32 v4, v15 +; VI-NEXT: v_mov_b32_e32 v9, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s10, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 8 +; GFX9-NEXT: s_lshr_b32 s13, s16, 16 +; GFX9-NEXT: s_lshr_b32 s12, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v11, v4, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, 0x7fc07fc0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v11 :: v_dual_add_nc_u32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v0, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v5, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v9, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_branch .LBB39_5 +; GFX11-TRUE16-NEXT: .LBB39_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB39_2 +; GFX11-TRUE16-NEXT: .LBB39_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: .LBB39_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v6, v7, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v2, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v4, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v7, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-FAKE16-NEXT: .LBB19_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB39_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB39_2 +; GFX11-FAKE16-NEXT: .LBB39_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4427,97 +8657,97 @@ end: } define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v16 -; GCN-NEXT: v_or_b32_e32 v5, v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v13 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v17, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v15, v4 -; GCN-NEXT: v_or_b32_e32 v2, v14, v2 -; GCN-NEXT: v_or_b32_e32 v5, v12, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6f16: ; VI: ; %bb.0: @@ -4534,14 +8764,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: s_cbranch_execnz .LBB40_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_4 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB40_4 +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB20_3: ; %cmp.false +; VI-NEXT: .LBB40_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4564,8 +8794,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: .LBB20_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: .LBB40_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4606,14 +8836,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-NEXT: s_cbranch_execnz .LBB40_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_4 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB40_4 +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB20_3: ; %cmp.false +; GFX9-NEXT: .LBB40_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4636,8 +8866,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: .LBB20_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: .LBB40_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4682,14 +8912,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_4 -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB20_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h @@ -4727,8 +8957,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-TRUE16-NEXT: .LBB20_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 @@ -4782,14 +9012,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_4 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB20_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -4825,8 +9055,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -4880,90 +9110,437 @@ end: ret <6 x half> %phi } +define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s16, 0xff +; SI-NEXT: s_lshl_b32 s10, s17, 8 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v12i8_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s8, s17, 8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s27, 24 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s7, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB41_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v4, v13, v1 -; GCN-NEXT: v_or_b32_e32 v8, v12, v2 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_bfe_u32 v11, v10, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v4, v2, v4 -; GCN-NEXT: v_or_b32_e32 v8, v3, v5 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_bfe_u32 v11, v10, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_or_b32_e32 v4, v13, v1 +; SI-NEXT: v_or_b32_e32 v8, v12, v7 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v12i8: ; VI: ; %bb.0: @@ -4992,7 +9569,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5014,7 +9591,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 @@ -5038,7 +9615,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v13 @@ -5049,9 +9626,9 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] @@ -5067,7 +9644,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v15 ; GFX9-NEXT: v_mov_b32_e32 v4, v16 @@ -5098,7 +9675,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -5112,7 +9689,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 -; GFX11-TRUE16-NEXT: .LBB21_4: ; %end +; GFX11-TRUE16-NEXT: .LBB42_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h @@ -5141,7 +9718,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v13 @@ -5152,9 +9729,9 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX11-FAKE16-NEXT: .LBB21_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] @@ -5171,7 +9748,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX11-FAKE16-NEXT: .LBB21_4: ; %end +; GFX11-FAKE16-NEXT: .LBB42_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v15 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v16 @@ -5194,115 +9771,462 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_or_b32_e32 v4, v13, v1 +; SI-NEXT: v_or_b32_e32 v8, v12, v7 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v6f16_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s13, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s12, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v13, s17, v1 +; VI-NEXT: v_add_f16_e32 v2, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v12, v13, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: v_or_b32_e32 v11, v0, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_or_b32_e32 v14, v8, v3 +; VI-NEXT: v_mov_b32_e32 v15, 0x7e007e00 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v14 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v4, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v2, s19 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v10, s14 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e007e00 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; GFX9-NEXT: s_branch .LBB43_5 +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v14, s16 +; GFX9-NEXT: v_mov_b32_e32 v15, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB43_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v6f16_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, 0x7e007e00 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB43_5 +; GFX11-TRUE16-NEXT: .LBB43_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB43_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v6f16_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, 0x7e007e00 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; GFX11-FAKE16-NEXT: s_branch .LBB43_5 +; GFX11-FAKE16-NEXT: .LBB43_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB43_5: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v14 :: v_dual_mov_b32 v9, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v15 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v18 -; GCN-NEXT: v_or_b32_e32 v4, v4, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v7, v1 -; GCN-NEXT: v_or_b32_e32 v6, v12, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_or_b32_e32 v2, v0, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v6 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v6, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v18, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v17, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v3 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6i16: ; VI: ; %bb.0: @@ -5319,14 +10243,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB22_3 +; VI-NEXT: s_cbranch_execnz .LBB44_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB22_4 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB44_4 +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB22_3: ; %cmp.false +; VI-NEXT: .LBB44_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5349,8 +10273,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 -; VI-NEXT: .LBB22_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: .LBB44_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5391,14 +10315,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB22_3 +; GFX9-NEXT: s_cbranch_execnz .LBB44_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB22_4 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB44_4 +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB22_3: ; %cmp.false +; GFX9-NEXT: .LBB44_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5421,8 +10345,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 -; GFX9-NEXT: .LBB22_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: .LBB44_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5467,14 +10391,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_4 -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_4 +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h @@ -5512,8 +10436,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 @@ -5567,14 +10491,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_4 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_4 +; GFX11-FAKE16-NEXT: .LBB44_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB22_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -5610,8 +10534,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -5665,89 +10589,454 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s27, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s10, s9, s8 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_or_b32 s8, s4, s10 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s10, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v12i8_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s8, s17, 8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s4, s19, 24 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s22, 0xff +; VI-NEXT: s_lshl_b32 s5, s23, 24 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s8, s26, 0xff +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_lshl_b32 s7, s27, 24 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s7, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB45_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v5 -; GCN-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v3 -; GCN-NEXT: v_mov_b32_e32 v13, v2 -; GCN-NEXT: v_mov_b32_e32 v14, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NEXT: v_bfe_u32 v7, v16, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v17 -; GCN-NEXT: v_or_b32_e32 v4, v1, v18 -; GCN-NEXT: v_or_b32_e32 v8, v2, v19 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_bfe_u32 v11, v15, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v17, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v2 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v4, v1, v18 +; SI-NEXT: v_or_b32_e32 v8, v6, v17 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; SI-NEXT: v_bfe_u32 v7, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v15, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v12i8: ; VI: ; %bb.0: @@ -5767,7 +11056,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -5780,9 +11069,9 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v8, v2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5803,7 +11092,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v16 ; VI-NEXT: v_mov_b32_e32 v1, v15 @@ -5829,7 +11118,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5840,9 +11129,9 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -5856,7 +11145,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -5886,7 +11175,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -5899,7 +11188,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 -; GFX11-TRUE16-NEXT: .LBB23_4: ; %end +; GFX11-TRUE16-NEXT: .LBB46_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h @@ -5928,7 +11217,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5939,9 +11228,9 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB46_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -5956,7 +11245,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB23_4: ; %end +; GFX11-FAKE16-NEXT: .LBB46_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 @@ -5978,80 +11267,422 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_lshr_b32 s12, s8, 8 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_bfe_u32 s11, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s14, s21, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s11, s7, 24 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_lshr_b32 s14, s8, 24 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s8, 8 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v6i16_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s16, 3 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_add_i32 s16, s6, 0x30000 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s15 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: s_branch .LBB47_5 +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB47_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v6i16_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB47_5 +; GFX11-TRUE16-NEXT: .LBB47_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: .LBB47_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v6i16_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB47_5 +; GFX11-FAKE16-NEXT: .LBB47_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB47_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB24_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB24_4 -; GCN-NEXT: .LBB24_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB24_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: .LBB24_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB48_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB48_4 +; SI-NEXT: .LBB48_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB48_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: .LBB48_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v6f16: ; VI: ; %bb.0: @@ -6060,7 +11691,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6117,7 +11748,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6128,7 +11759,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6177,7 +11808,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v5, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v4, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6189,7 +11820,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -6245,7 +11876,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6257,7 +11888,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6312,7 +11943,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %end +; GFX11-FAKE16-NEXT: .LBB48_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6332,74 +11963,386 @@ end: ret <6 x half> %phi } +define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v9 :: v_dual_add_nc_u32 v1, v1, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v6 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6bf16: ; VI: ; %bb.0: @@ -6408,7 +12351,7 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v0 @@ -6420,7 +12363,7 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v6, v2 ; VI-NEXT: v_or_b32_e32 v1, v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6472,70 +12415,219 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s21 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_4 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_e32 v3, s16, v0 +; VI-NEXT: v_add_f16_sdwa v4, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v5, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v3, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_3: +; VI-NEXT: s_branch .LBB51_2 +; VI-NEXT: .LBB51_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_4 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB26_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: .LBB26_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: v_alignbit_b32 v0, v7, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v6, 16 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v6i16: ; VI: ; %bb.0: @@ -6544,7 +12636,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6601,7 +12693,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6612,7 +12704,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6661,7 +12753,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v5, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v4, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6673,7 +12765,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -6733,7 +12825,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v3 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6745,7 +12837,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6800,7 +12892,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %end +; GFX11-FAKE16-NEXT: .LBB52_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6820,59 +12912,349 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: +; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: .LBB53_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_and_or_b32 v0, v3, v6, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_add_nc_u32 v7, v7, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v4, v5 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v4 -; GCN-NEXT: v_mov_b32_e32 v7, v2 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v5, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6bf16: ; VI: ; %bb.0: @@ -6881,7 +13263,7 @@ define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -6893,7 +13275,7 @@ define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6944,50 +13326,192 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_lshl_b32 s10, s21, 16 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s5, 16 +; SI-NEXT: s_and_b32 s10, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v7 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6i16: ; VI: ; %bb.0: @@ -6996,7 +13520,7 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v0 @@ -7008,7 +13532,7 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v6, v2 ; VI-NEXT: v_or_b32_e32 v1, v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7060,62 +13584,200 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v6f16_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_4 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_e32 v3, s16, v0 +; VI-NEXT: v_add_f16_sdwa v4, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v5, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v3, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_3: +; VI-NEXT: s_branch .LBB57_2 +; VI-NEXT: .LBB57_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v5 -; GCN-NEXT: v_mov_b32_e32 v7, v4 -; GCN-NEXT: v_mov_b32_e32 v8, v3 -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v10, v1 -; GCN-NEXT: v_mov_b32_e32 v11, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6f16: ; VI: ; %bb.0: @@ -7124,7 +13786,7 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7136,7 +13798,7 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7186,3 +13848,130 @@ end: %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <6 x half> %phi } + +define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v6i16_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll index 01a1e6b73ac6a..2b48cf0f41c88 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll @@ -1,25 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: bitcast_i8ptr_v16i8ptr: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_endpgm +; SI-LABEL: bitcast_i8ptr_v16i8ptr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm ; ; VI-LABEL: bitcast_i8ptr_v16i8ptr: ; VI: ; %bb.0: ; %entry